# unittest_ecutil
add_executable(unittest_ecutil
  TestECUtil.cc
  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:erasure_code_objs>
  )
add_ceph_unittest(unittest_ecutil)
target_link_libraries(unittest_ecutil osd global)
)
add_ceph_unittest(unittest_peeringstate)
target_link_libraries(unittest_peeringstate osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# pg_backend_test_fixture: object library for PGBackendTestFixture implementation.
+# An OBJECT library compiles the sources once; test executables pull the
+# resulting objects in via $<TARGET_OBJECTS:pg_backend_test_fixture>.
+add_library(pg_backend_test_fixture OBJECT
+  PGBackendTestFixture.cc
+)
+# target_link_libraries on an OBJECT library does not link anything; it only
+# propagates usage requirements (include dirs, definitions) from osd/os/global
+# so the fixture sources compile (requires CMake >= 3.12 semantics).
+target_link_libraries(pg_backend_test_fixture osd os global)
+
+# ec_peering_test_fixture: object library for ECPeeringTestFixture implementation
+add_library(ec_peering_test_fixture OBJECT
+  ECPeeringTestFixture.cc
+)
+target_link_libraries(ec_peering_test_fixture osd os global)
+
+# unittest_backend_basics (replaces unittest_ecbasics + unittest_replicatedbasics)
+add_executable(unittest_backend_basics
+  TestBackendBasics.cc
+  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:pg_backend_test_fixture>
+  )
+add_ceph_unittest(unittest_backend_basics)
+target_link_libraries(unittest_backend_basics osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# add_dependencies (not target_link_libraries) is intentional here: it only
+# orders the build so the erasure-code plugins exist before the test runs.
+# NOTE(review): presumably the test loads ec_isa/ec_jerasure at runtime via
+# the plugin loader rather than linking them - confirm.
+add_dependencies(unittest_backend_basics ec_isa ec_jerasure)
+# unittest_ecfailover_with_peering
+add_executable(unittest_ecfailover_with_peering
+  TestECFailoverWithPeering.cc
+  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:store_test_fixture>
+  $<TARGET_OBJECTS:pg_backend_test_fixture>
+  $<TARGET_OBJECTS:ec_peering_test_fixture>
+  )
+add_ceph_unittest(unittest_ecfailover_with_peering)
+target_link_libraries(unittest_ecfailover_with_peering osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# Build-order dependency only: the ec_isa plugin is loaded at runtime.
+# NOTE(review): unittest_backend_basics also depends on ec_jerasure but this
+# test only needs ec_isa - confirm the asymmetry is intentional.
+add_dependencies(unittest_ecfailover_with_peering ec_isa)
# unittest_hitset
add_executable(unittest_hitset
hitset.cc
target_link_libraries(unittest_mclock_scheduler
global osd dmclock os
)
+
+# osd_unittests: custom target that builds all OSD unit tests and runs them
+# via ctest. This is for development convenience only; it is not used as
+# part of "make check".
+# unittest_osdmap is deliberately excluded because it is relatively slow;
+# it is tested elsewhere.
+set(OSD_UNITTESTS
+  unittest_backend_basics
+  unittest_ec_transaction
+  unittest_ec_transaction_l
+  unittest_ecbackend
+  unittest_ecbackend_l
+  unittest_ecfailover_with_peering
+  unittest_ecutil
+  unittest_extent_cache
+  unittest_extent_cache_l
+  unittest_hitset
+  unittest_mclock_scheduler
+  unittest_osd_osdcap
+  unittest_osd_types
+  unittest_osdscrub
+  unittest_peeringstate
+  unittest_pg_transaction
+  unittest_pglog
+  unittest_scrubber_be
+)
+
+# Using ctest (instead of invoking the binaries directly) ensures:
+# - All selected tests run even if one fails (ctest's default behaviour;
+#   it only stops early when --stop-on-failure is passed, which we don't)
+# - Output is shown for failing tests (--output-on-failure)
+# - Adding a new test only requires adding it to OSD_UNITTESTS above
+string(JOIN "|" OSD_UNITTEST_REGEX ${OSD_UNITTESTS})
+add_custom_target(osd_unittests
+  DEPENDS ${OSD_UNITTESTS}
+  COMMAND ${CMAKE_CTEST_COMMAND}
+    --test-dir ${CMAKE_BINARY_DIR}
+    -R "^(${OSD_UNITTEST_REGEX})$"
+    --output-on-failure
+  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+  COMMENT "Building and running all OSD unit tests via ctest"
+  VERBATIM
+)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "test/osd/ECPeeringTestFixture.h"
+
+// Build the full peering stack for one shard: a dout prefix provider, a
+// MockPeeringListener (which adopts the backend listener created by the base
+// fixture), a MockPGBackend wired to that listener, and finally the
+// PeeringState itself plus a fresh PeeringCtx.
+// Returns a borrowed pointer; ownership stays in shard_peering_states[shard].
+PeeringState* ECPeeringTestFixture::create_peering_state(int shard)
+{
+  const pg_pool_t& pi = get_pool();
+  // In these tests the OSD id and the EC shard id are the same value.
+  pg_shard_t pg_whoami(shard, shard_id_t(shard));
+  PGPool pool(osdmap, pool_id, pi, "test_pool");
+
+  shard_dpps[shard] = std::make_unique<ShardDpp>(g_ceph_context, this, shard);
+
+  shard_peering_listeners[shard] = std::make_unique<MockPeeringListener>(
+    osdmap, pool_id, shard_dpps[shard].get(), pg_whoami);
+  shard_peering_listeners[shard]->current_epoch = osdmap->get_epoch();
+
+  // Route transactions issued by the peering listener through this fixture's
+  // per-shard store helper.
+  shard_peering_listeners[shard]->queue_transaction_callback =
+    [this, shard](ObjectStore::Transaction&& t) -> int {
+      return queue_transaction_helper(shard, std::move(t));
+    };
+
+  // Transfer ownership of the backend listener from the base class listeners[]
+  // map into the peering listener. The factory (set in our constructor) already
+  // recorded a raw pointer in backend_listeners[] so we know which entry to move.
+  // After the move, listeners[shard] holds a null unique_ptr; TearDown() already
+  // guards against that with "if (list)".
+  shard_peering_listeners[shard]->backend_listener = std::move(listeners[shard]);
+  shard_peering_listeners[shard]->coll = colls[shard];
+  shard_peering_listeners[shard]->ch = chs[shard];
+
+  // Recreate backend with the correct backend_listener pointer.
+  // The MockPeeringListener constructor created backend with the temporary
+  // backend_listener it allocated internally, but we just replaced backend_listener
+  // with the one from the base class listeners[] map. We must recreate backend
+  // so its parent pointer points to the new backend_listener, not the destroyed one.
+  shard_peering_listeners[shard]->backend = std::make_unique<MockPGBackend>(
+    g_ceph_context,
+    shard_peering_listeners[shard]->backend_listener.get(),
+    nullptr,
+    colls[shard],
+    chs[shard]);
+
+  spg_t spgid(pgid, shard_id_t(shard));
+  auto ps = std::make_unique<PeeringState>(
+    g_ceph_context,
+    pg_whoami,
+    spgid,
+    pool,
+    osdmap,
+    PG_FEATURE_CLASSIC_ALL,
+    shard_dpps[shard].get(),
+    shard_peering_listeners[shard].get());
+
+  shard_peering_listeners[shard]->ps = ps.get();
+
+  // NOTE(review): the predicates are heap-allocated by the helpers below;
+  // presumably PeeringState takes ownership - confirm.
+  ps->set_backend_predicates(
+    get_is_readable_predicate(),
+    get_is_recoverable_predicate());
+
+  shard_peering_states[shard] = std::move(ps);
+  shard_peering_listeners[shard]->backend_listener->set_peering_state(shard_peering_states[shard].get());
+  shard_peering_ctxs[shard] = std::make_unique<PeeringCtx>();
+
+  return shard_peering_states[shard].get();
+}
+
+// Seed the PG history and call PeeringState::init() on every acting shard,
+// committing the resulting transaction to each shard's store.
+// When dne is true, history.epoch_created is left at its default so the PG
+// appears not to exist yet (the "does-not-exist" creation path).
+void ECPeeringTestFixture::init_peering(bool dne)
+{
+  pg_history_t history;
+  history.same_interval_since = osdmap->get_epoch();
+  history.epoch_pool_created = osdmap->get_epoch();
+  history.last_epoch_clean = osdmap->get_epoch();
+  if (!dne) {
+    history.epoch_created = osdmap->get_epoch();
+  }
+  PastIntervals past_intervals;
+
+  // Get primary from OSDMap using base class pgid member
+  std::vector<int> up_osds, acting_osds;
+  int up_primary = -1, acting_primary = -1;
+  osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+  for (int shard : acting_osds) {
+    ObjectStore::Transaction t;
+    // NOTE(review): role is 0 for the primary and 1 for every other shard;
+    // confirm non-primary EC shards don't need their actual acting-set
+    // position here.
+    get_peering_state(shard)->init(
+      (shard == acting_primary) ? 0 : 1, // role
+      up_osds,
+      up_primary,
+      acting_osds,
+      acting_primary,
+      history,
+      past_intervals,
+      t);
+
+    queue_transaction_helper(shard, std::move(t));
+  }
+}
+
+// Install a new OSDMap and push the change through every shard's
+// PeeringState (advance_map then activate_map), dispatch all resulting
+// messages/events, and finally publish up to three extra epochs so that
+// up_thru / pg_temp requests raised during peering get satisfied.
+void ECPeeringTestFixture::update_osdmap_with_peering(
+  std::shared_ptr<OSDMap> new_osdmap,
+  std::optional<pg_shard_t> new_primary)
+{
+  OSDMapRef old_osdmap = osdmap;
+
+  update_osdmap(new_osdmap, new_primary);
+
+  // Update peering listeners for ALL shards (even failed ones need epoch updates)
+  for (auto& [shard, listener] : shard_peering_listeners) {
+    listener->current_epoch = new_osdmap->get_epoch();
+  }
+
+  // Get primary from OSDMap for advance_map calls using base class pgid member
+  std::vector<int> up_osds, acting_osds;
+  int up_primary = -1, acting_primary = -1;
+  osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+  // Call advance_map on ALL shards that have peering states, including failed ones
+  // This ensures that failed OSDs are notified of map changes (e.g., primary failover)
+  // Use the newly computed up_osds and acting_osds from the new OSDMap
+  for (auto& [shard, ps] : shard_peering_states) {
+    ps->advance_map(
+      osdmap, old_osdmap, up_osds, up_primary, acting_osds, acting_primary,
+      *get_peering_ctx(shard));
+  }
+
+  // Call activate_map on ALL shards that have peering states
+  // This ensures failed OSDs properly transition state and notify their backends
+  for (auto& [shard, ps] : shard_peering_states) {
+    ps->activate_map(*get_peering_ctx(shard));
+  }
+
+  dispatch_all();
+
+  // Handle up_thru requirements - keep creating new epochs until peering completes.
+  // Note: For primary failover scenarios, full peering may not complete immediately.
+  // The loop is bounded: new_epoch(true) returns false once no work is pending.
+  int max_iterations = 3;
+  do {
+    event_advance_map();
+    event_activate_map();
+  } while (new_epoch(true) && --max_iterations);
+}
+
+// Emulate the monitor publishing a new OSDMap epoch. Grants any pending
+// up_thru requests from acting shards and applies any pg_temp the current
+// primary's listener requested (with the primaryfirst transformation for
+// optimized EC pools). When if_required is true, no epoch is published
+// unless at least one such change is pending.
+// Returns true iff a new epoch was applied.
+bool ECPeeringTestFixture::new_epoch(bool if_required)
+{
+  bool did_work = false;
+  epoch_t e = osdmap->get_epoch();
+  OSDMap::Incremental pending_inc(e + 1);
+  pending_inc.fsid = osdmap->get_fsid();
+
+  // Get acting set from OSDMap
+  std::vector<int> acting_osds;
+  int acting_primary = -1;
+  osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+  for (int shard : acting_osds) {
+    // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+    if (shard == CRUSH_ITEM_NONE) {
+      continue;
+    }
+    // Grant up_thru for any shard whose peering state asked for it.
+    if (get_peering_state(shard)->get_need_up_thru()) {
+      pending_inc.new_up_thru[shard] = e;
+      did_work = true;
+    }
+  }
+
+  if (acting_primary >= 0) {
+    auto& listener = shard_peering_listeners[acting_primary];
+    if (listener->pg_temp_wanted) {
+      // Get up set from OSDMap
+      std::vector<int> up_osds;
+      int up_primary = -1;
+      osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, nullptr, nullptr);
+
+      // Fall back to the up set when the listener didn't record an
+      // explicit next_acting.
+      std::vector<int> acting_temp = listener->next_acting;
+      if (acting_temp.empty()) {
+        acting_temp = up_osds;
+      }
+
+      // Apply the pg_temp change that peering requested.
+      // For EC pools with optimizations, transform to primaryfirst order
+      // (this simulates what the monitor does in production).
+      const pg_pool_t* pool = osdmap->get_pg_pool(this->pgid.pool());
+      std::vector<int> pg_temp_acting = acting_temp;
+      if (pool && pool->allows_ecoptimizations()) {
+        pg_temp_acting = osdmap->pgtemp_primaryfirst(*pool, acting_temp);
+      }
+
+      pending_inc.new_pg_temp[this->pgid] =
+        mempool::osdmap::vector<int>(pg_temp_acting.begin(), pg_temp_acting.end());
+
+      listener->pg_temp_wanted = false;
+      did_work = true;
+    }
+  }
+
+  if (!did_work && if_required) {
+    return false;
+  }
+
+  osdmap->apply_incremental(pending_inc);
+
+  // Tell every shard's listener about the new epoch.
+  for (auto& [shard, listener] : shard_peering_listeners) {
+    listener->current_epoch = osdmap->get_epoch();
+  }
+
+  return true;
+}
+
+// Drive a full peering cycle from scratch: initialise peering on every
+// acting shard, pump the Initialize/AdvanceMap/ActivateMap events, then
+// keep publishing new epochs (to satisfy up_thru / pg_temp requests) until
+// every acting shard reports active, bounded by a fixed iteration budget.
+void ECPeeringTestFixture::run_peering_cycle()
+{
+  init_peering();
+  event_initialize();
+  dispatch_all();
+  event_advance_map();
+  dispatch_all();
+  event_activate_map();
+  dispatch_all();
+
+  // Handle up_thru requirements - keep creating new epochs until peering
+  // completes or the iteration budget runs out.
+  int remaining = 10;
+  while (remaining-- > 0 && !all_shards_active()) {
+    if (!new_epoch(true)) {
+      continue;
+    }
+    event_advance_map();
+    dispatch_all();
+    event_activate_map();
+    dispatch_all();
+  }
+}
+
+// Submit a transaction to one shard's object store.
+// Empty transactions are a no-op and return 0; otherwise returns the
+// ObjectStore::queue_transaction() result.
+int ECPeeringTestFixture::queue_transaction_helper(int shard, ObjectStore::Transaction&& t)
+{
+  // Nothing to commit for an empty transaction.
+  if (t.empty()) {
+    return 0;
+  }
+
+  // Contexts are stolen by MockPGBackendListener::queue_transaction, so no
+  // explicit finisher execution is needed here.
+  return store->queue_transaction(chs[shard], std::move(t));
+}
+
+// Mark one OSD down in a brand-new OSDMap epoch and drive peering through
+// the change. This mirrors the real monitor: only the down flag is set
+// here - pg_temp is NOT set. Peering is expected to detect the change and
+// request pg_temp itself, which we then grant below.
+void ECPeeringTestFixture::mark_osd_down(int osd_id)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osd_down(next_map, osd_id);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+
+  // Grant any pg_temp request raised by peering (emulates the monitor
+  // processing MOSDPGTemp, applying the primaryfirst transform if needed).
+  if (new_epoch(false)) {
+    event_advance_map();
+    dispatch_all();
+  }
+}
+
+// Mark one OSD up in a brand-new OSDMap epoch and drive peering through
+// the change (via OSDMapTestHelpers).
+void ECPeeringTestFixture::mark_osd_up(int osd_id)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osd_up(next_map, osd_id);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
+// Mark several OSDs down in a single new OSDMap epoch and drive peering
+// through the change (via OSDMapTestHelpers).
+void ECPeeringTestFixture::mark_osds_down(const std::vector<int>& osd_ids)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osds_down(next_map, osd_ids);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
+// Publish a new OSDMap epoch without changing any OSD state and drive
+// peering through it. Useful for re-peering scenarios.
+void ECPeeringTestFixture::advance_epoch()
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::advance_epoch(next_map);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <map>
+#include <vector>
+#include "test/osd/PGBackendTestFixture.h"
+#include "test/osd/MockPeeringListener.h"
+#include "test/osd/MockConnection.h"
+#include "test/osd/MockECRecPred.h"
+#include "test/osd/MockECReadPred.h"
+#include "test/osd/OSDMapTestHelpers.h"
+#include "osd/PeeringState.h"
+#include "messages/MOSDPeeringOp.h"
+
+/**
+ * ECPeeringTestFixture - EC test fixture with full peering infrastructure
+ *
+ * This fixture extends PGBackendTestFixture to add full PeeringState support
+ * for each shard, enabling comprehensive testing of EC peering, recovery,
+ * and failover scenarios. It combines the principles from TestPeeringState
+ * with the EC backend infrastructure from PGBackendTestFixture.
+ */
+class ECPeeringTestFixture : public PGBackendTestFixture {
+protected:
+  // Per-shard peering machinery, keyed by shard/OSD id (the two are the
+  // same value in these tests).
+  std::map<int, std::unique_ptr<PeeringState>> shard_peering_states;
+  std::map<int, std::unique_ptr<PeeringCtx>> shard_peering_ctxs;
+  std::map<int, std::unique_ptr<MockPeeringListener>> shard_peering_listeners;
+
+  // Per-shard queues; NOTE(review): neither appears to be populated within
+  // this class - presumably filled by tests or the base fixture; confirm.
+  std::map<int, std::list<MessageRef>> shard_messages;
+  std::map<int, std::list<PGPeeringEventRef>> shard_events;
+
+  // Raw-pointer map giving this fixture direct access to the backend listeners
+  // created by the listener_factory. The pointers are valid for the lifetime
+  // of the test because ownership is transferred to
+  // shard_peering_listeners[i]->backend_listener in create_peering_state().
+  std::map<int, MockPGBackendListener*> backend_listeners;
+
+  // Per-shard DoutPrefixProvider: prefixes log lines with the shard id and
+  // (when available) the current PeeringState.
+  class ShardDpp : public NoDoutPrefix {
+  public:
+    ECPeeringTestFixture *fixture;
+    int shard;
+
+    ShardDpp(CephContext *cct, ECPeeringTestFixture *f, int s)
+      : NoDoutPrefix(cct, ceph_subsys_osd), fixture(f), shard(s) {}
+
+    std::ostream& gen_prefix(std::ostream& out) const override {
+      out << "shard " << shard << ": ";
+      if (fixture->shard_peering_states.contains(shard)) {
+        PeeringState *ps = fixture->shard_peering_states[shard].get();
+        out << *ps << " ";
+      }
+      return out;
+    }
+  };
+  std::map<int, std::unique_ptr<ShardDpp>> shard_dpps;
+
+  // Heap-allocates a recoverability predicate for set_backend_predicates().
+  // NOTE(review): presumably the callee takes ownership - confirm.
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+    return new MockECRecPred(k, m);
+  }
+
+  // Heap-allocates a readability predicate; same ownership note as above.
+  IsPGReadablePredicate *get_is_readable_predicate() {
+    return new MockECReadPred(k, m);
+  }
+
+public:
+  ECPeeringTestFixture() : PGBackendTestFixture(PGBackendTestFixture::EC) {
+    // Install a listener_factory so that setup_ec_pool() creates listeners
+    // that we can access directly (via backend_listeners[]) without needing
+    // to steal ownership via release_listener().
+    //
+    // The factory records a raw pointer in backend_listeners[instance] and
+    // returns the unique_ptr to the base class, which stores it in listeners[].
+    // In create_peering_state() we then move that unique_ptr from listeners[]
+    // into shard_peering_listeners[]->backend_listener, at which point the
+    // raw pointer in backend_listeners[] remains valid (owned by the peering
+    // listener for the rest of the test).
+    listener_factory = [this](
+      int instance,
+      std::shared_ptr<OSDMap> om,
+      int64_t pool_id,
+      DoutPrefixProvider* dpp_arg,
+      pg_shard_t whoami) -> std::unique_ptr<MockPGBackendListener>
+    {
+      auto bl = std::make_unique<MockPGBackendListener>(
+        om, pool_id, dpp_arg, whoami);
+      // Record raw pointer so tests can access the listener directly
+      backend_listeners[instance] = bl.get();
+      return bl;
+    };
+  }
+
+  // Runs the base fixture setup, then builds a peering state for each of
+  // the k+m shards.
+  void SetUp() override {
+    PGBackendTestFixture::SetUp();
+    for (int i = 0; i < k + m; i++) {
+      create_peering_state(i);
+    }
+  }
+
+  // Tear down peering state before the base fixture so nothing references
+  // base-class resources during destruction.
+  void TearDown() override {
+    shard_peering_states.clear();
+    shard_peering_ctxs.clear();
+    shard_peering_listeners.clear();
+    shard_dpps.clear();
+    shard_messages.clear();
+    shard_events.clear();
+    PGBackendTestFixture::TearDown();
+  }
+
+  PeeringState* create_peering_state(int shard);
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  PeeringState* get_peering_state(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_states.find(shard);
+    ceph_assert(it != shard_peering_states.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  PeeringCtx* get_peering_ctx(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_ctxs.find(shard);
+    ceph_assert(it != shard_peering_ctxs.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  MockPeeringListener* get_peering_listener(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_listeners.find(shard);
+    ceph_assert(it != shard_peering_listeners.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  /**
+   * Query the OSDMap to determine which shard is the primary.
+   * This is the authoritative source of truth for primary determination.
+   *
+   * @return The shard ID of the primary, or -1 if no primary exists
+   */
+  int get_primary_shard_from_osdmap() const {
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+    return acting_primary;
+  }
+
+  // Override base class methods to work with peering fixture's structure
+  MockPGBackendListener* get_primary_listener() override {
+    int primary_shard = get_primary_shard_from_osdmap();
+    if (primary_shard < 0) {
+      return nullptr;
+    }
+
+    auto it = shard_peering_listeners.find(primary_shard);
+    if (it != shard_peering_listeners.end() && it->second &&
+        it->second->backend_listener) {
+      // Assert that the backend listener agrees it's primary
+      ceph_assert(it->second->backend_listener->pgb_is_primary());
+      return it->second->backend_listener.get();
+    }
+    return nullptr;
+  }
+
+  PGBackend* get_primary_backend() override {
+    int primary_shard = get_primary_shard_from_osdmap();
+    if (primary_shard < 0) {
+      return nullptr;
+    }
+
+    auto listener_it = shard_peering_listeners.find(primary_shard);
+    if (listener_it != shard_peering_listeners.end() && listener_it->second &&
+        listener_it->second->backend_listener) {
+      // Assert that the backend listener agrees it's primary
+      ceph_assert(listener_it->second->backend_listener->pgb_is_primary());
+
+      // Return the backend from the base class's backends map, not from
+      // the peering listener, because the base class backend is connected
+      // to the event loop and message routers
+      auto backend_it = backends.find(primary_shard);
+      return (backend_it != backends.end()) ? backend_it->second.get() : nullptr;
+    }
+    return nullptr;
+  }
+
+  void init_peering(bool dne = false);
+
+  // Deliver a PeeringState::Initialize event to every acting shard.
+  void event_initialize() {
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      auto evt = std::make_shared<PGPeeringEvent>(
+        osdmap->get_epoch(),
+        osdmap->get_epoch(),
+        PeeringState::Initialize());
+
+      get_peering_state(shard)->handle_event(evt, get_peering_ctx(shard));
+    }
+  }
+
+  // Run PeeringState::advance_map() on every acting shard with the current
+  // map passed as both old and new map (no epoch change).
+  void event_advance_map() {
+    // Get primary from OSDMap - query once before the loop
+    std::vector<int> up_osds, acting_osds;
+    int up_primary = -1, acting_primary = -1;
+    osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      get_peering_state(shard)->advance_map(
+        osdmap, osdmap, up_osds, up_primary, acting_osds, acting_primary,
+        *get_peering_ctx(shard));
+    }
+  }
+
+  // Run PeeringState::activate_map() on every acting shard.
+  void event_activate_map() {
+    // Get acting set from OSDMap - must use same set as advance_map
+    std::vector<int> up_osds, acting_osds;
+    int up_primary = -1, acting_primary = -1;
+    osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      get_peering_state(shard)->activate_map(*get_peering_ctx(shard));
+    }
+  }
+
+private:
+  // Dispatch all messages from a map<int, Container<MessageRef>>.
+  // Templated to work with both std::vector (PeeringCtx::message_map) and
+  // std::list (MockPeeringListener::messages).
+  template <typename Container>
+  bool dispatch_messages_from_map(int from_shard,
+                                  std::map<int, Container>& msg_map) {
+    bool did_work = false;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (auto& [to_shard, msg_list] : msg_map) {
+      // Only deliver to shards that are currently in the acting set.
+      if (std::find(acting_osds.begin(), acting_osds.end(), to_shard) == acting_osds.end()) {
+        continue;
+      }
+
+      while (!msg_list.empty()) {
+        MessageRef m = msg_list.front();
+        msg_list.erase(msg_list.begin());
+
+        // Cast to MOSDPeeringOp - all peering messages inherit from this.
+        // Use dynamic_cast with assertion to catch unexpected message types.
+        // Use m.get() (not m.detach()) to avoid leaking the raw pointer.
+        MOSDPeeringOp *op = dynamic_cast<MOSDPeeringOp*>(m.get());
+        ceph_assert(op != nullptr) /* message must be a MOSDPeeringOp */;
+
+        // Set connection peer to the SENDER, not the destination
+        ceph_msg_header h = op->get_header();
+        h.src.num = from_shard;
+        op->set_header(h);
+
+        ConnectionRef conn = new MockConnection(from_shard);
+        op->set_connection(conn);
+
+        // get_event() returns a newly allocated PGPeeringEvent,
+        // so we take ownership directly into a shared_ptr (matching OSD.cc pattern)
+        PGPeeringEventRef evt_ref(op->get_event());
+
+        get_peering_state(to_shard)->handle_event(
+          evt_ref,
+          get_peering_ctx(to_shard));
+
+        did_work = true;
+      }
+    }
+
+    return did_work;
+  }
+
+public:
+  // Deliver all peering messages queued in from_shard's PeeringCtx.
+  bool dispatch_peering_messages(int from_shard) {
+    auto* ctx = get_peering_ctx(from_shard);
+    return dispatch_messages_from_map(from_shard, ctx->message_map);
+  }
+
+  // Deliver all cluster messages queued on from_shard's peering listener.
+  // NOTE(review): uses operator[], which default-inserts a null entry if the
+  // shard was never created - confirm callers only pass valid shards.
+  bool dispatch_cluster_messages(int from_shard) {
+    auto& listener = shard_peering_listeners[from_shard];
+    return dispatch_messages_from_map(from_shard, listener->messages);
+  }
+
+  // Repeatedly dispatch peering messages from every acting shard until a
+  // full round produces no work. Returns whether any work was done.
+  bool dispatch_all_peering_messages() {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_peering_messages(shard);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Drain and handle one shard's queued peering events (stalled or normal
+  // queue). Returns whether any event was handled.
+  bool dispatch_events(int shard, bool stalled = false) {
+    auto& listener = shard_peering_listeners[shard];
+    std::list<PGPeeringEventRef>& event_queue =
+      stalled ? listener->stalled_events : listener->events;
+
+    if (event_queue.empty()) {
+      return false;
+    }
+
+    bool did_work = false;
+    while (!event_queue.empty()) {
+      PGPeeringEventRef evt = event_queue.front();
+      event_queue.pop_front();
+
+      get_peering_state(shard)->handle_event(evt, get_peering_ctx(shard));
+      did_work = true;
+    }
+
+    return did_work;
+  }
+
+  // Repeatedly dispatch events on every acting shard until a full round
+  // produces no work.
+  bool dispatch_all_events(bool stalled = false) {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_events(shard, stalled);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Repeatedly dispatch cluster messages from every acting shard until a
+  // full round produces no work.
+  bool dispatch_all_cluster_messages() {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_cluster_messages(shard);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Keep pumping peering messages, cluster messages and events until the
+  // whole system quiesces. Returns whether anything at all was dispatched.
+  bool dispatch_all() {
+    bool did_work = false;
+    bool work_this_round;
+
+    do {
+      work_this_round = false;
+      work_this_round |= dispatch_all_peering_messages();
+      work_this_round |= dispatch_all_cluster_messages();
+      work_this_round |= dispatch_all_events();
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // IMPORTANT: For EC pools, shard positions in acting array must be preserved.
+  // Failed OSDs should be replaced with CRUSH_ITEM_NONE, not removed.
+  void update_osdmap_with_peering(
+    std::shared_ptr<OSDMap> new_osdmap,
+    std::optional<pg_shard_t> new_primary = std::nullopt);
+
+  bool new_epoch(bool if_required = false);
+
+  int queue_transaction_helper(int shard, ObjectStore::Transaction&& t);
+
+  void run_peering_cycle();
+
+  // OSDMap manipulation helpers - these create a new epoch and trigger peering
+
+  /**
+   * Mark an OSD as down (exists but not UP).
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osd_down(int osd_id);
+
+  /**
+   * Mark an OSD as up.
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osd_up(int osd_id);
+
+  /**
+   * Mark multiple OSDs as down.
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osds_down(const std::vector<int>& osd_ids);
+
+  /**
+   * Advance to a new epoch without changing OSD states.
+   * Useful for testing re-peering scenarios.
+   */
+  void advance_epoch();
+
+  // True when every acting (non-failed) shard's PeeringState is active.
+  bool all_shards_active() {
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      if (!get_peering_state(shard)->is_active()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // In EC pools, only the primary tracks PG_STATE_CLEAN.
+  bool all_shards_clean() {
+    // Get primary from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    if (acting_primary >= 0 && acting_primary != CRUSH_ITEM_NONE) {
+      return get_peering_state(acting_primary)->is_clean();
+    }
+    return false;
+  }
+
+  // Human-readable name of one shard's current peering state.
+  std::string get_state_name(int shard) {
+    return get_peering_state(shard)->get_current_state();
+  }
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <queue>
+#include <map>
+#include <list>
+#include <vector>
+#include "include/types.h"
+#include "messages/MOSDOp.h"
+#include "osd/OpRequest.h"
+#include "osd/PeeringState.h"
+#include "os/ObjectStore.h"
+
+/**
+ * EventLoop - Unified single-threaded event loop for OSD tests.
+ *
+ * Combines EC backend messages, ObjectStore transactions, peering messages,
+ * and peering events into a single deterministic queue. This allows tests
+ * to properly interleave peering state changes with EC backend operations.
+ */
+class EventLoop {
+public:
+  using GenericEvent = std::function<void()>;
+
+  // Categories used only for logging and statistics; every event is executed
+  // the same way regardless of its type.
+  enum class EventType {
+    GENERIC,
+    OSD_MESSAGE,
+    TRANSACTION,
+    PEERING_MESSAGE,
+    CLUSTER_MESSAGE,
+    PEERING_EVENT
+  };
+
+private:
+  // One queued unit of work: a type tag, an optional OSD id, and a callback.
+  struct Event {
+    EventType type;
+    int osd; // -1 for generic events
+    GenericEvent callback;
+
+    Event(EventType t, int o, GenericEvent cb)
+      : type(t), osd(o), callback(std::move(cb)) {}
+  };
+
+  std::queue<Event> event_queue;   // FIFO: events run in schedule order
+  bool verbose = false;            // print per-event trace when true
+  int events_executed = 0;         // total since construction / reset_stats()
+  std::map<EventType, int> events_by_type;
+
+  static constexpr const char* event_type_name(EventType type) {
+    switch (type) {
+    case EventType::GENERIC: return "GENERIC";
+    case EventType::OSD_MESSAGE: return "OSD_MESSAGE";
+    case EventType::TRANSACTION: return "TRANSACTION";
+    case EventType::PEERING_MESSAGE: return "PEERING_MESSAGE";
+    case EventType::CLUSTER_MESSAGE: return "CLUSTER_MESSAGE";
+    case EventType::PEERING_EVENT: return "PEERING_EVENT";
+    default: return "UNKNOWN";
+    }
+  }
+
+public:
+  // NOTE(review): non-explicit, so a bool converts implicitly to EventLoop;
+  // consider marking explicit if no caller relies on that conversion.
+  EventLoop(bool verbose = false) : verbose(verbose) {}
+
+  void schedule_generic(GenericEvent event) {
+    event_queue.emplace(EventType::GENERIC, -1, std::move(event));
+  }
+
+  void schedule_osd_message(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::OSD_MESSAGE, osd, std::move(callback));
+  }
+
+  void schedule_transaction(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::TRANSACTION, osd, std::move(callback));
+  }
+
+  void schedule_peering_message(int to_osd, GenericEvent callback) {
+    event_queue.emplace(EventType::PEERING_MESSAGE, to_osd, std::move(callback));
+  }
+
+  void schedule_cluster_message(int to_osd, GenericEvent callback) {
+    event_queue.emplace(EventType::CLUSTER_MESSAGE, to_osd, std::move(callback));
+  }
+
+  void schedule_peering_event(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::PEERING_EVENT, osd, std::move(callback));
+  }
+
+  bool has_events() const {
+    return !event_queue.empty();
+  }
+
+  size_t queued_event_count() const {
+    return event_queue.size();
+  }
+
+  int get_events_executed() const {
+    return events_executed;
+  }
+
+  const std::map<EventType, int>& get_stats_by_type() const {
+    return events_by_type;
+  }
+
+  // Reset counters only; queued events are untouched (see clear()).
+  void reset_stats() {
+    events_executed = 0;
+    events_by_type.clear();
+  }
+
+  // Pop and execute one event. Returns false when the queue is empty.
+  // Note: the event is popped BEFORE its callback runs, so a callback may
+  // safely schedule further events.
+  bool run_one() {
+    if (event_queue.empty()) {
+      return false;
+    }
+
+    Event event = std::move(event_queue.front());
+    event_queue.pop();
+
+    if (verbose) {
+      std::cout << "  [Event " << (events_executed + 1) << "] "
+                << event_type_name(event.type);
+      if (event.osd >= 0) {
+        std::cout << " (OSD " << event.osd << ")";
+      }
+      std::cout << " Executing..." << std::endl;
+    }
+
+    event.callback();
+    events_executed++;
+    events_by_type[event.type]++;
+
+    return true;
+  }
+
+  // Execute up to count events; returns how many actually ran (fewer if the
+  // queue drained early).
+  int run_many(int count) {
+    if (verbose) {
+      std::cout << "\n=== Running " << count << " events ===" << std::endl;
+    }
+
+    int executed = 0;
+    for (int i = 0; i < count && run_one(); i++) {
+      executed++;
+    }
+
+    if (verbose) {
+      std::cout << "=== Executed " << executed << " events, "
+                << event_queue.size() << " remaining ===" << std::endl;
+    }
+
+    return executed;
+  }
+
+  /**
+   * Run until the queue is empty or max_events is reached.
+   * max_events == 0 means unbounded.
+   * Returns -1 if max_events was reached before the queue emptied,
+   * otherwise the number of events executed.
+   */
+  int run_until_idle(int max_events = 0) {
+    if (verbose) {
+      std::cout << "\n=== Running until idle";
+      if (max_events > 0) {
+        std::cout << " (max " << max_events << " events)";
+      }
+      std::cout << " ===" << std::endl;
+    }
+
+    int executed = 0;
+    while (has_events()) {
+      if (max_events > 0 && executed >= max_events) {
+        if (verbose) {
+          std::cout << "=== Max events (" << max_events << ") reached, "
+                    << event_queue.size() << " events remaining ===" << std::endl;
+        }
+        return -1; // Timeout
+      }
+
+      run_one();
+      executed++;
+    }
+
+    if (verbose) {
+      std::cout << "=== Idle: Executed " << executed << " events ===" << std::endl;
+      print_stats();
+    }
+
+    return executed;
+  }
+
+  /**
+   * Run until a condition is met, idle, or max_events is reached.
+   * The condition is checked after each event execution (NOT before the
+   * first event - an already-true condition still runs one event).
+   * Returns -1 if max_events was reached.
+   * NOTE(review): when the queue drains without the condition becoming true,
+   * the executed count is returned just as in the success case - callers
+   * must re-check the condition to distinguish the two outcomes.
+   */
+  int run_until(int max_events, std::function<bool()> condition) {
+    if (verbose) {
+      std::cout << "\n=== Running until condition";
+      if (max_events > 0) {
+        std::cout << " (max " << max_events << " events)";
+      }
+      std::cout << " ===" << std::endl;
+    }
+
+    int executed = 0;
+    while (has_events()) {
+      if (max_events > 0 && executed >= max_events) {
+        if (verbose) {
+          std::cout << "=== Max events (" << max_events << ") reached ===" << std::endl;
+        }
+        return -1; // Timeout
+      }
+
+      run_one();
+      executed++;
+
+      if (condition()) {
+        if (verbose) {
+          std::cout << "=== Condition met after " << executed << " events ===" << std::endl;
+        }
+        return executed;
+      }
+    }
+
+    if (verbose) {
+      std::cout << "=== Idle: Executed " << executed << " events, condition not met ===" << std::endl;
+    }
+
+    return executed;
+  }
+
+  // Discard all queued events without running them. Stats are kept.
+  void clear() {
+    while (!event_queue.empty()) {
+      event_queue.pop();
+    }
+  }
+
+  void set_verbose(bool v) {
+    verbose = v;
+  }
+
+  void print_stats() const {
+    if (events_by_type.empty()) {
+      return;
+    }
+
+    std::cout << "=== Event Statistics ===" << std::endl;
+    for (const auto& [type, count] : events_by_type) {
+      std::cout << "  " << event_type_name(type) << ": " << count << std::endl;
+    }
+    std::cout << "  TOTAL: " << events_executed << std::endl;
+  }
+};
+
//MockConnection - simple stub. Required because PeeringState needs
//to know the features of the peer OSD which sent a peering message
class MockConnection : public Connection {
+ private:
+ int peer_osd;
+
public:
- MockConnection() : Connection(g_ceph_context, nullptr) {
+ MockConnection(int peer = -1) : Connection(g_ceph_context, nullptr), peer_osd(peer) {
set_features(CEPH_FEATURES_ALL);
}
+ int get_peer_osd() const {
+ return peer_osd;
+ }
+
bool is_connected() override {
return true;
}
return entity_addr_t();
}
};
-
#include <set>
#include "osd/PGBackend.h"
-// MockECReadPred - simple stub for IsPGReadablePredicate
-// Warning - this always returns true. This means we cannot test scenarios
-// where there are too many OSDs down and the PG should be incomplete
+/**
+ * MockECReadPred - configurable stub for IsPGReadablePredicate.
+ *
+ * When constructed with default arguments (k=0, m=0), always returns true
+ * (original behaviour, suitable for basic tests that don't need quorum
+ * checking).
+ *
+ * When constructed with real k and m values, implements proper quorum
+ * checking: the PG is readable if at least k shards are available (i.e.
+ * we have enough data shards to reconstruct the object without needing
+ * any coding shards).
+ *
+ * This enables negative testing of scenarios where too many OSDs are down
+ * and the PG should be unreadable.
+ *
+ * NOTE(review): only the shard *count* is checked, not which shards are
+ * present; a real EC read needs k distinct shards. Adequate for these
+ * tests - confirm if a test ever depends on shard identity.
+ */
class MockECReadPred : public IsPGReadablePredicate {
 public:
-  MockECReadPred() {}
-  bool operator()(const std::set<pg_shard_t> &_have) const override {
-    return true;
+  /**
+   * @param k Number of data chunks (0 = always-true mode)
+   * @param m Number of coding chunks (unused in read predicate, kept for
+   *          symmetry with MockECRecPred)
+   */
+  explicit MockECReadPred(int k = 0, int m = 0) : k(k), m(m) {}
+
+  bool operator()(const std::set<pg_shard_t> &have) const override {
+    // When k==0 fall back to always-true (backward-compatible default)
+    if (k == 0) {
+      return true;
+    }
+    // Readable when we have at least k shards available
+    return static_cast<int>(have.size()) >= k;
  }
+
+ private:
+  int k;
+  int m;
};
#include <set>
#include "osd/PGBackend.h"
-// MockECRecPred - simple stub for IsPGRecoverablePredicate
-// Warning - this always returns true. This means we cannot test scenarios
-// where there are too many OSDs down and the PG should be incomplete
+/**
+ * MockECRecPred - configurable stub for IsPGRecoverablePredicate.
+ *
+ * When constructed with default arguments (k=0, m=0), always returns true
+ * (original behaviour, suitable for basic tests that don't need quorum
+ * checking).
+ *
+ * When constructed with real k and m values, implements proper quorum
+ * checking: the PG is recoverable if at least k shards are available (i.e.
+ * we have enough shards to reconstruct all data, since any k-of-(k+m) EC
+ * scheme can recover from up to m failures).
+ *
+ * This enables negative testing of scenarios where too many OSDs are down
+ * and the PG should be marked Incomplete.
+ *
+ * NOTE(review): as with MockECReadPred, only the shard count is checked,
+ * not shard identity.
+ */
class MockECRecPred : public IsPGRecoverablePredicate {
 public:
-  MockECRecPred() {}
+  /**
+   * @param k Number of data chunks (0 = always-true mode)
+   * @param m Number of coding chunks (0 = always-true mode)
+   */
+  explicit MockECRecPred(int k = 0, int m = 0) : k(k), m(m) {}
-  bool operator()(const std::set<pg_shard_t> &_have) const override {
-    return true;
+  bool operator()(const std::set<pg_shard_t> &have) const override {
+    // When k==0 fall back to always-true (backward-compatible default)
+    if (k == 0) {
+      return true;
+    }
+    // Recoverable when we have at least k shards (can tolerate up to m failures)
+    return static_cast<int>(have.size()) >= k;
  }
+
+ private:
+  int k;
+  int m;
};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
-// vim: ts=8 sw=2 sts=2 expandtab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2026 IBM
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include "common/ostream_temp.h"
-
-//MockLog - simple stub
-class MockLog : public LoggerSinkSet {
- public:
- void debug(std::stringstream& s) final
- {
- std::cout << "\n<<debug>> " << s.str() << std::endl;
- }
-
- void info(std::stringstream& s) final
- {
- std::cout << "\n<<info>> " << s.str() << std::endl;
- }
-
- void sec(std::stringstream& s) final
- {
- std::cout << "\n<<sec>> " << s.str() << std::endl;
- }
-
- void warn(std::stringstream& s) final
- {
- std::cout << "\n<<warn>> " << s.str() << std::endl;
- }
-
- void error(std::stringstream& s) final
- {
- err_count++;
- std::cout << "\n<<error>> " << s.str() << std::endl;
- }
-
- OstreamTemp info() final { return OstreamTemp(CLOG_INFO, this); }
- OstreamTemp warn() final { return OstreamTemp(CLOG_WARN, this); }
- OstreamTemp error() final { return OstreamTemp(CLOG_ERROR, this); }
- OstreamTemp sec() final { return OstreamTemp(CLOG_ERROR, this); }
- OstreamTemp debug() final { return OstreamTemp(CLOG_DEBUG, this); }
-
- void do_log(clog_type prio, std::stringstream& ss) final
- {
- switch (prio) {
- case CLOG_DEBUG:
- debug(ss);
- break;
- case CLOG_INFO:
- info(ss);
- break;
- case CLOG_SEC:
- sec(ss);
- break;
- case CLOG_WARN:
- warn(ss);
- break;
- case CLOG_ERROR:
- default:
- error(ss);
- break;
- }
- }
-
- void do_log(clog_type prio, const std::string& ss) final
- {
- switch (prio) {
- case CLOG_DEBUG:
- debug() << ss;
- break;
- case CLOG_INFO:
- info() << ss;
- break;
- case CLOG_SEC:
- sec() << ss;
- break;
- case CLOG_WARN:
- warn() << ss;
- break;
- case CLOG_ERROR:
- default:
- error() << ss;
- break;
- }
- }
-
- virtual ~MockLog() {}
-
- int err_count{0};
- int expected_err_count{0};
- void set_expected_err_count(int c) { expected_err_count = c; }
-};
-
#pragma once
-#include <functional>
-#include <list>
-#include <optional>
-#include <vector>
#include "osd/PGBackend.h"
-#include "osd/ECUtil.h"
#include "os/ObjectStore.h"
// MockPGBackend - simple stub for PGBackend
#pragma once
+#include <functional>
+#include <vector>
#include <map>
-#include <set>
-#include <optional>
#include "osd/PGBackend.h"
+#include "osd/ECBackend.h"
+#include "osd/PGLog.h"
#include "osd/OSDMap.h"
#include "osd/osd_types.h"
-#include "osd/PGLog.h"
-#include "common/intrusive_timer.h"
-#include "common/ostream_temp.h"
-#include "global/global_context.h"
+#include "osd/osd_perf_counters.h"
+#include "osd/PeeringState.h"
+#include "common/ceph_context.h"
+#include "common/TrackedOp.h"
+#include "common/perf_counters.h"
+#include "messages/MOSDPGPush.h"
#include "os/ObjectStore.h"
+#include "global/global_context.h"
+#include "test/osd/MockConnection.h"
+#include "test/osd/EventLoop.h"
+#include "osd/OpRequest.h"
-// MockPGBackendListener - simple stub for PGBackend::Listener
-class MockPGBackendListener : public PGBackend::Listener {
+// MockPGBackendListener - mock PGBackend::Listener and ECListener for multi-instance testing.
+class MockPGBackendListener : public PGBackend::Listener, public ECListener {
public:
pg_info_t info;
OSDMapRef osdmap;
- const pg_pool_t pool;
+ int64_t pool_id;
PGLog log;
DoutPrefixProvider *dpp;
pg_shard_t pg_whoami;
std::set<pg_shard_t> shardset;
+
+ // Pointer to PeeringState for tests that use full peering
+ PeeringState *peering_state = nullptr;
+
+ shard_id_set acting_recovery_backfill_shard_id_set;
std::map<pg_shard_t, pg_info_t> shard_info;
std::map<pg_shard_t, pg_missing_t> shard_missing;
std::map<hobject_t, std::set<pg_shard_t>> missing_loc_shards;
pg_missing_tracker_t local_missing;
-
- MockPGBackendListener(OSDMapRef osdmap, const pg_pool_t pi, DoutPrefixProvider *dpp, pg_shard_t pg_whoami) :
- osdmap(osdmap), pool(pi), log(g_ceph_context), dpp(dpp), pg_whoami(pg_whoami) {}
+
+ std::vector<MessageRef> sent_messages;
+ std::vector<std::pair<int, MessageRef>> sent_messages_with_dest;
+
+ ObjectStore *store = nullptr;
+ ObjectStore::CollectionHandle ch;
+ EventLoop *event_loop = nullptr;
+ std::function<bool(OpRequestRef)> handle_message_callback;
+ std::map<int, std::function<bool(OpRequestRef)>> *message_router = nullptr;
+ OpTracker *op_tracker = nullptr;
+ PerfCounters *perf_logger = nullptr;
+
+ MockPGBackendListener(OSDMapRef osdmap, int64_t pool_id, DoutPrefixProvider *dpp, pg_shard_t pg_whoami, PeeringState *ps = nullptr) :
+ osdmap(osdmap), pool_id(pool_id), log(g_ceph_context), dpp(dpp), pg_whoami(pg_whoami), peering_state(ps) {
+ // Create a full OSD PerfCounters using the standard build_osd_logger function.
+ // This prevents null pointer dereferences when ReplicatedBackend calls get_logger()->inc().
+ perf_logger = build_osd_logger(g_ceph_context);
+ }
+
+ ~MockPGBackendListener() {
+ if (perf_logger) {
+ delete perf_logger;
+ perf_logger = nullptr;
+ }
+ }
+
+ void set_store(ObjectStore *s, ObjectStore::CollectionHandle c) {
+ store = s;
+ ch = c;
+ }
+
+ void set_event_loop(EventLoop *loop) {
+ event_loop = loop;
+ }
+
+ void set_op_tracker(OpTracker *tracker) {
+ op_tracker = tracker;
+ }
+
+ void set_peering_state(PeeringState *ps) {
+ peering_state = ps;
+ }
+
+ void set_handle_message_callback(std::function<bool(OpRequestRef)> cb) {
+ handle_message_callback = cb;
+ }
+
+ void set_message_router(std::map<int, std::function<bool(OpRequestRef)>> *router) {
+ message_router = router;
+ }
// Debugging
DoutPrefixProvider *get_dpp() override {
pg_shard_t peer,
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info) override {
+ if (peering_state) {
+ peering_state->on_peer_recover(peer, oid, recovery_info.version);
+ }
}
void begin_peer_recover(
pg_shard_t peer,
const hobject_t oid) override {
+ if (peering_state) {
+ peering_state->begin_peer_recover(peer, oid);
+ }
}
void apply_stats(
return c;
}
- // Messaging
+ // Routes messages through EventLoop for asynchronous EC message processing.
void send_message(int to_osd, Message *m) override {
+ MessageRef mref(m);
+ sent_messages.push_back(mref);
+ sent_messages_with_dest.push_back({to_osd, mref});
+
+ if (event_loop && op_tracker && message_router) {
+ // Capture the sender's OSD ID
+ int from_osd = pg_whoami.osd;
+
+ // IMPORTANT: Encode the message payload to simulate network transmission
+ // This ensures that txn_payload is moved to the middle section for MOSDRepOp messages
+ // Without this, Transaction::decode will fail because the message structure is incomplete
+ mref->encode_payload(CEPH_FEATURES_ALL);
+
+ event_loop->schedule_osd_message(to_osd, [this, mref, to_osd, from_osd]() {
+ if (!mref->get_connection()) {
+ // Set connection peer to the SENDER, not the destination
+ ConnectionRef conn = new MockConnection(from_osd);
+ mref->set_connection(conn);
+ }
+ OpRequestRef op = op_tracker->create_request<OpRequest>(mref.get());
+
+ // Route to the correct shard's backend using the message router
+ auto it = message_router->find(to_osd);
+ if (it != message_router->end()) {
+ it->second(op);
+ }
+ });
+ }
}
void queue_transaction(
ObjectStore::Transaction&& t,
OpRequestRef op = OpRequestRef()) override {
+ std::vector<ObjectStore::Transaction> tls;
+ tls.push_back(std::move(t));
+ queue_transactions(tls, op);
}
void queue_transactions(
std::vector<ObjectStore::Transaction>& tls,
OpRequestRef op = OpRequestRef()) override {
+ if (event_loop && store && ch) {
+ // Steal the Context callbacks from the transactions before calling MemStore.
+ // This allows the test harness to manage the context callbacks itself instead of using
+ // a Finisher thread. This keeps the test harness single threaded and gives more
+ // control for ordering async replies.
+ Context *on_apply = nullptr;
+ Context *on_apply_sync = nullptr;
+ Context *on_commit = nullptr;
+ ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit, &on_apply_sync);
+
+ // Execute transactions through the store (without contexts - we stole them)
+ store->queue_transactions(ch, tls, TrackedOpRef(), nullptr);
+
+ // Apply the on_apply_sync synchronously. This is what queue_transactions
+ // would do anyway.
+ // NOTE: Memstore will panic rather than fail
+ if (on_apply_sync) {
+ on_apply_sync->complete(0);
+ }
+
+ if (on_apply) {
+ event_loop->schedule_transaction(pg_whoami.osd, [on_apply]() mutable {
+ on_apply->complete(0);
+ });
+ }
+ if (on_commit) {
+ event_loop->schedule_transaction(pg_whoami.osd, [on_commit]() mutable {
+ on_commit->complete(0);
+ });
+ }
+ }
}
epoch_t get_interval_start_epoch() const override {
+ if (peering_state) {
+ return peering_state->get_info().history.same_interval_since;
+ }
return 1;
}
epoch_t get_last_peering_reset_epoch() const override {
+ if (peering_state) {
+ return peering_state->get_last_peering_reset();
+ }
return 1;
}
return shardset;
}
+ const shard_id_set &get_acting_recovery_backfill_shard_id_set() const {
+ return acting_recovery_backfill_shard_id_set;
+ }
+
const std::set<pg_shard_t> &get_acting_shards() const override {
+ if (peering_state) {
+ return peering_state->get_actingset();
+ }
return shardset;
}
const std::set<pg_shard_t> &get_backfill_shards() const override {
+ if (peering_state) {
+ return peering_state->get_backfill_targets();
+ }
return shardset;
}
}
const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards() const override {
+ if (peering_state) {
+ return peering_state->get_missing_loc().get_missing_locs();
+ }
return missing_loc_shards;
}
const pg_missing_tracker_t &get_local_missing() const override {
+ if (peering_state) {
+ return peering_state->get_pg_log().get_missing();
+ }
return local_missing;
}
void add_local_next_event(const pg_log_entry_t& e) override {
+ if (peering_state) {
+ peering_state->add_local_next_event(e);
+ }
}
const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+ if (peering_state) {
+ return peering_state->get_peer_missing();
+ }
return shard_missing;
}
const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const override {
+ if (peering_state) {
+ auto m = maybe_get_shard_missing(peer);
+ ceph_assert(m);
+ return *m;
+ }
return local_missing;
}
const std::map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+ if (peering_state) {
+ return peering_state->get_peer_info();
+ }
return shard_info;
}
const PGLog &get_log() const override {
+ if (peering_state) {
+ return peering_state->get_pg_log();
+ }
return log;
}
bool pgb_is_primary() const override {
- return true;
+ // For peering tests, use the PeeringState's view of primary
+ if (peering_state) {
+ return peering_state->is_primary();
+ }
+
+ // For basic tests without peering, query the OSDMap to determine primary
+ // This uses pg_temp if set, otherwise uses the CRUSH mapping
+ std::vector<int> acting;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(info.pgid.pgid, &acting, &acting_primary);
+
+ return pg_whoami.osd == acting_primary;
}
const OSDMapRef& pgb_get_osdmap() const override {
}
const pg_info_t &get_info() const override {
+ // When PeeringState is available, use its pg_info_t as the single source of truth
+ if (peering_state) {
+ return peering_state->get_info();
+ }
return info;
}
const pg_pool_t &get_pool() const override {
- return pool;
+ const pg_pool_t *p = osdmap->get_pg_pool(pool_id);
+ ceph_assert(p != nullptr);
+ return *p;
}
eversion_t get_pg_committed_to() const override {
+ if (peering_state) {
+ return peering_state->get_pg_committed_to();
+ }
return eversion_t();
}
bool transaction_applied,
ObjectStore::Transaction &t,
bool async = false) override {
+ // If we have a PeeringState, append the log entries to it
+ // This creates proper integration between backend operations and peering state
+ if (peering_state && !logv.empty()) {
+ peering_state->append_log(
+ std::move(logv),
+ trim_to,
+ roll_forward_to,
+ pg_committed_to,
+ t,
+ transaction_applied,
+ async);
+ }
}
void pgb_set_object_snap_mapping(
void update_peer_last_complete_ondisk(
pg_shard_t fromosd,
eversion_t lcod) override {
+ if (peering_state) {
+ peering_state->update_peer_last_complete_ondisk(fromosd, lcod);
+ }
}
void update_last_complete_ondisk(eversion_t lcod) override {
+ if (peering_state) {
+ peering_state->update_last_complete_ondisk(lcod);
+ }
}
void update_pct(eversion_t pct) override {
+ if (peering_state) {
+ peering_state->update_pct(pct);
+ }
}
void update_stats(const pg_stat_t &stat) override {
+ if (peering_state) {
+ peering_state->update_stats(
+ [&stat](auto &history, auto &stats) {
+ stats = stat;
+ return false;
+ });
+ }
}
void schedule_recovery_work(
}
pg_shard_t primary_shard() const override {
- return pg_shard_t();
+ if (peering_state) {
+ return peering_state->get_primary();
+ }
+
+ // Query the OSDMap to get the current primary
+ pg_t pgid = info.pgid.pgid;
+ std::vector<int> acting;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting, &acting_primary);
+
+ // For EC pools, the primary shard ID matches the OSD ID in the acting set
+ // For replicated pools, use NO_SHARD
+ if (pg_whoami.shard != shard_id_t::NO_SHARD) {
+ // EC pool: find the shard ID of the acting primary in the acting set
+ shard_id_t primary_shard_id = shard_id_t::NO_SHARD;
+ for (size_t i = 0; i < acting.size(); i++) {
+ if (acting[i] == acting_primary) {
+ primary_shard_id = shard_id_t(i);
+ break;
+ }
+ }
+ return pg_shard_t(acting_primary, primary_shard_id);
+ } else {
+ // Replicated pool: use NO_SHARD
+ return pg_shard_t(acting_primary, shard_id_t::NO_SHARD);
+ }
}
uint64_t min_peer_features() const override {
+ if (peering_state) {
+ return peering_state->get_min_peer_features();
+ }
return CEPH_FEATURES_ALL;
}
uint64_t min_upacting_features() const override {
+ if (peering_state) {
+ return peering_state->get_min_upacting_features();
+ }
return CEPH_FEATURES_ALL;
}
pg_feature_vec_t get_pg_acting_features() const override {
+ if (peering_state) {
+ return peering_state->get_pg_acting_features();
+ }
return pg_feature_vec_t();
}
void send_message_osd_cluster(
int peer, Message *m, epoch_t from_epoch) override {
+ send_message(peer, m);
}
void send_message_osd_cluster(
std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) override {
+ for (auto& [osd, m] : messages) {
+ send_message(osd, m);
+ }
}
-  void send_message_osd_cluster(MessageRef, Connection *con) override {
+  // Route via the connection's recorded peer OSD. Every Connection in these
+  // tests is a MockConnection; assert that explicitly instead of
+  // dereferencing a null result from a failed dynamic_cast.
+  void send_message_osd_cluster(MessageRef m, Connection *con) override {
+    MockConnection* mock_con = dynamic_cast<MockConnection*>(con);
+    ceph_assert(mock_con != nullptr);
+    send_message(mock_con->get_peer_osd(), m.get());
  }
  void send_message_osd_cluster(Message *m, const ConnectionRef& con) override {
+    MockConnection* mock_con = dynamic_cast<MockConnection*>(con.get());
+    ceph_assert(mock_con != nullptr);
+    send_message(mock_con->get_peer_osd(), m);
  }
void start_mon_command(
}
PerfCounters *get_logger() override {
- return nullptr;
+ return perf_logger;
}
ceph_tid_t get_tid() override {
bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
return false;
}
+ void add_temp_obj(const hobject_t &oid) override {
+ }
+
+ void clear_temp_obj(const hobject_t &oid) override {
+ }
+
+ const pg_missing_const_i * maybe_get_shard_missing(
+ pg_shard_t peer) const override {
+ if (peering_state) {
+ if (peer == peering_state->get_primary()) {
+ return &peering_state->get_pg_log().get_missing();
+ } else {
+ auto i = peering_state->get_peer_missing().find(peer);
+ if (i == peering_state->get_peer_missing().end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ }
+ return &local_missing;
+ }
+
+ const pg_info_t &get_shard_info(pg_shard_t peer) const override {
+ if (peering_state) {
+ if (peer == peering_state->get_primary()) {
+ return peering_state->get_info();
+ } else {
+ auto i = peering_state->get_peer_info().find(peer);
+ ceph_assert(i != peering_state->get_peer_info().end());
+ return i->second;
+ }
+ }
+
+ auto it = shard_info.find(peer);
+ if (it != shard_info.end()) {
+ return it->second;
+ }
+ return info;
+ }
+
+ bool is_missing_object(const hobject_t& oid) const override {
+ return false;
+ }
+ void send_message_osd_cluster(
+ int osd, MOSDPGPush* msg, epoch_t from_epoch) override {
+ send_message(osd, msg);
+ }
struct ECListener *get_eclistener() override {
- return nullptr;
+ return static_cast<ECListener *>(this);
}
};
#include "osd/PGLog.h"
#include "os/ObjectStore.h"
-#include "test/osd/MockPGBackend.h"
-
-// dout using global context and OSD subsystem
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_osd
+#include "MockPGBackend.h"
// MockPGLogEntryHandler
//
// LogEntryHandler
void remove(const hobject_t &hoid) override {
- dout(0) << "MockPGLogEntryHandler::remove " << hoid << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::remove " << hoid << dendl;
backend->remove(hoid, t);
}
void try_stash(const hobject_t &hoid, version_t v) override {
- dout(0) << "MockPGLogEntryHandler::try_stash " << hoid << " " << v << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::try_stash " << hoid << " " << v << dendl;
backend->try_stash(hoid, v, t);
}
void rollback(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::rollback " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::rollback " << entry << dendl;
ceph_assert(entry.can_rollback());
backend->rollback(entry, t);
}
void rollforward(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::rollforward " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::rollforward " << entry << dendl;
backend->rollforward(entry, t);
}
void trim(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::trim " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::trim " << entry << dendl;
backend->trim(entry, t);
}
void partial_write(pg_info_t *info, eversion_t previous_version,
const pg_log_entry_t &entry
) override {
- dout(0) << "MockPGLogEntryHandler::partial_write " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::partial_write " << entry << dendl;
backend->partial_write(info, previous_version, entry);
}
};
-#undef dout_context
-#undef dout_subsys
-
#pragma once
-#include <list>
-#include <map>
#include <memory>
-#include <set>
-#include <string>
#include <vector>
+#include <list>
+#include <map>
#include "osd/PeeringState.h"
#include "osd/osd_perf_counters.h"
-#include "common/perf_counters_collection.h"
-#include "global/global_context.h"
+#include "common/HeartbeatMap.h"
#include "os/ObjectStore.h"
-#include "test/osd/MockLog.h"
-#include "test/osd/MockPGBackend.h"
-#include "test/osd/MockPGBackendListener.h"
-#include "test/osd/MockPGLogEntryHandler.h"
+#include "MockPGBackendListener.h"
+#include "MockPGBackend.h"
+#include "MockPGLogEntryHandler.h"
+#include "global/global_context.h"
-// dout using global context and OSD subsystem
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_osd
-using namespace std;
-
-// Mock PeeringListener - stub of PeeringState::PeeringListener
-// to help with testing of PeeringState. Keep track of calls
-// from PeeringState and emulate some of PrimaryLogPG/PG
-// functionality for testing purposes.
-//
-// There are some inject_* variables that can be used to help
-// tests create race hazards or test failure paths
+// Mock implementation of PeeringState::PeeringListener for testing.
+// inject_* variables can be used to create race hazards or test failure paths.
class MockPeeringListener : public PeeringState::PeeringListener {
public:
pg_shard_t pg_whoami;
- MockLog logger;
PeeringState *ps;
- unique_ptr<MockPGBackendListener> backend_listener;
+ std::unique_ptr<MockPGBackendListener> backend_listener;
coll_t coll;
ObjectStore::CollectionHandle ch;
- unique_ptr<MockPGBackend> backend;
+ std::unique_ptr<MockPGBackend> backend;
PerfCounters* recoverystate_perf;
PerfCounters* logger_perf;
std::vector<int> next_acting;
// migration requests with too full
bool inject_fail_reserve_recovery_space = false;
+ std::function<int(ObjectStore::Transaction&&)> queue_transaction_callback;
+
MockPeeringListener(OSDMapRef osdmap,
- const pg_pool_t pi,
+ int64_t pool_id,
DoutPrefixProvider *dpp,
pg_shard_t pg_whoami) : pg_whoami(pg_whoami) {
- backend_listener = make_unique<MockPGBackendListener>(osdmap, pi, dpp, pg_whoami);
- backend = make_unique<MockPGBackend>(g_ceph_context, backend_listener.get(), nullptr, coll, ch);
+ backend_listener = std::make_unique<MockPGBackendListener>(osdmap, pool_id, dpp, pg_whoami);
+ backend = std::make_unique<MockPGBackend>(g_ceph_context, backend_listener.get(), nullptr, coll, ch);
recoverystate_perf = build_recoverystate_perf(g_ceph_context);
g_ceph_context->get_perfcounters_collection()->add(recoverystate_perf);
logger_perf = build_osd_logger(g_ceph_context);
g_ceph_context->get_perfcounters_collection()->add(logger_perf);
}
- // EpochSource interface
+ ~MockPeeringListener() {
+ if (recoverystate_perf) {
+ g_ceph_context->get_perfcounters_collection()->remove(recoverystate_perf);
+ delete recoverystate_perf;
+ recoverystate_perf = nullptr;
+ }
+ if (logger_perf) {
+ g_ceph_context->get_perfcounters_collection()->remove(logger_perf);
+ delete logger_perf;
+ logger_perf = nullptr;
+ }
+ }
+
epoch_t get_osdmap_epoch() const override {
return current_epoch;
}
bool need_write_epoch,
ObjectStore::Transaction &t) override {
prepare_write_called = true;
+
+ // If a callback is set, queue the transaction
+ if (queue_transaction_callback && !t.empty()) {
+ ObjectStore::Transaction copy;
+ copy.append(t);
+ queue_transaction_callback(std::move(copy));
+ }
}
void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override {
}
void log_state_enter(const char *state) override {
- last_state_entered = string(state);
+ last_state_entered = std::string(state);
state_entered = true;
}
void log_state_exit(
const char *state_name, utime_t enter_time,
uint64_t events, utime_t event_dur) override {
- last_state_exited = string(state_name);
+ last_state_exited = std::string(state_name);
state_exited = true;
}
}
OstreamTemp get_clog_info() override {
- return logger.info();
+ return OstreamTemp(CLOG_INFO, nullptr);
}
OstreamTemp get_clog_error() override {
- return logger.error();
+ return OstreamTemp(CLOG_ERROR, nullptr);
}
OstreamTemp get_clog_debug() override {
- return logger.debug();
+ return OstreamTemp(CLOG_DEBUG, nullptr);
}
void on_activate_complete() override {
removal_called = true;
}
- // Test state tracking
unsigned target_pg_log_entries = 100;
bool renew_lease_scheduled = false;
bool check_readable_queued = false;
bool recovery_space_reserved = false;
bool recovery_space_unreserved = false;
bool missing_set_rebuilt = false;
- string last_state_entered;
+ std::string last_state_entered;
bool state_entered = false;
- string last_state_exited;
+ std::string last_state_exited;
bool state_exited = false;
mutable bool recovery_info_dumped = false;
epoch_t current_epoch = 1;
bool first_write_in_interval = false;
};
-#undef dout_context
-#undef dout_subsys
-
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "osd/OSDMap.h"
+#include "osd/osd_types.h"
+
+// Utility functions for managing OSDMap state in tests.
+// (Previously in OSDMapTestHelpers.h — embedded here as the sole user.)
+class OSDMapTestHelpers {
+public:
+ // Add or update a pool in the OSDMap. Pass pool_id=-1 to auto-assign.
+ // Returns the pool id actually used. Creates a new epoch.
+ static int64_t add_pool(
+ OSDMap& osdmap,
+ int64_t pool_id,
+ const pg_pool_t& pool,
+ const std::string& pool_name = "")
+ {
+ if (pool_id < 0) {
+ pool_id = osdmap.get_pool_max() + 1;
+ }
+
+ // Default the name from the id when the caller did not supply one.
+ std::string name = pool_name.empty() ?
+ ("pool_" + std::to_string(pool_id)) : pool_name;
+
+ // Use OSDMap::Incremental to properly add pool and pool name
+ // This ensures both pools map and pool_name map are updated correctly
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_pools[pool_id] = pool;
+ inc.new_pool_names[pool_id] = name;
+
+ osdmap.apply_incremental(inc);
+
+ return pool_id;
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static int64_t add_pool(
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ const pg_pool_t& pool,
+ const std::string& pool_name = "")
+ {
+ return add_pool(*osdmap, pool_id, pool, pool_name);
+ }
+
+ // Look up a pool by id; returns nullptr if the pool does not exist.
+ static const pg_pool_t* get_pool(
+ const OSDMap& osdmap,
+ int64_t pool_id)
+ {
+ return osdmap.get_pg_pool(pool_id);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static const pg_pool_t* get_pool(
+ const std::shared_ptr<OSDMap>& osdmap,
+ int64_t pool_id)
+ {
+ return get_pool(*osdmap, pool_id);
+ }
+
+ // Set acting set for a PG using pg_temp (standard Ceph mechanism for overriding CRUSH).
+ // For EC pools with nonprimary_shards optimization, pg_temp must be stored in
+ // "primaryfirst" order (primary-capable shards first). This simulates what the
+ // monitor does in production when initially setting up pg_temp.
+ // An empty 'acting' vector clears any existing pg_temp entry instead.
+ static void set_pg_acting(
+ OSDMap& osdmap,
+ pg_t pgid,
+ const std::vector<int>& acting)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+
+ if (acting.empty()) {
+ // Empty acting set means remove pg_temp
+ inc.new_pg_temp[pgid] = mempool::osdmap::vector<int32_t>();
+ } else {
+ // For EC pools with optimizations, transform to primaryfirst order.
+ // This is used for initial setup. For dynamic changes during peering,
+ // the test should let peering detect invalid primaries and request
+ // corrections via queue_want_pg_temp().
+ std::vector<int> transformed_acting = acting;
+ const pg_pool_t* pool = osdmap.get_pg_pool(pgid.pool());
+ if (pool && pool->allows_ecoptimizations()) {
+ transformed_acting = osdmap.pgtemp_primaryfirst(*pool, acting);
+ }
+
+ // Copy into the mempool-allocated vector type that pg_temp requires.
+ mempool::osdmap::vector<int32_t> temp_acting;
+ for (int osd : transformed_acting) {
+ temp_acting.push_back(osd);
+ }
+ inc.new_pg_temp[pgid] = temp_acting;
+ }
+
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void set_pg_acting(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ const std::vector<int>& acting)
+ {
+ set_pg_acting(*osdmap, pgid, acting);
+ }
+
+ // Fill 'acting' with the PG's current acting set; returns true when non-empty.
+ static bool get_pg_acting(
+ const OSDMap& osdmap,
+ pg_t pgid,
+ std::vector<int>& acting)
+ {
+ acting.clear();
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ return !acting.empty();
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static bool get_pg_acting(
+ const std::shared_ptr<OSDMap>& osdmap,
+ pg_t pgid,
+ std::vector<int>& acting)
+ {
+ return get_pg_acting(*osdmap, pgid, acting);
+ }
+
+ // Pin the acting primary for a PG via primary_temp. Creates a new epoch.
+ static void set_pg_acting_primary(
+ OSDMap& osdmap,
+ pg_t pgid,
+ int primary)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_primary_temp[pgid] = primary;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void set_pg_acting_primary(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ int primary)
+ {
+ set_pg_acting_primary(*osdmap, pgid, primary);
+ }
+
+ // Fetch the acting primary for a PG; returns true when a primary (>= 0) exists.
+ static bool get_pg_acting_primary(
+ const OSDMap& osdmap,
+ pg_t pgid,
+ int& primary)
+ {
+ std::vector<int> acting;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ return primary >= 0;
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static bool get_pg_acting_primary(
+ const std::shared_ptr<OSDMap>& osdmap,
+ pg_t pgid,
+ int& primary)
+ {
+ return get_pg_acting_primary(*osdmap, pgid, primary);
+ }
+
+ // Build an in-memory EC pg_pool_t with k data + m coding shards.
+ // NOTE(review): pool_id is currently unused in this builder — TODO confirm
+ // whether it was meant to be stored on the pool or can be dropped.
+ static pg_pool_t create_ec_pool(
+ int k,
+ int m,
+ uint64_t stripe_width,
+ uint64_t flags,
+ int64_t pool_id = 0)
+ {
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_ERASURE;
+ pool.size = k + m;
+ pool.min_size = k;
+ pool.crush_rule = 0;
+ pool.erasure_code_profile = "default";
+ pool.stripe_width = stripe_width;
+
+ // Set flags as specified by caller
+ pool.flags = flags;
+
+ // Only set nonprimary_shards if OPTIMIZATIONS flag is set
+ if (flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) {
+ // Mark shards 1 to k-1 (inclusive) as nonprimary
+ // Shard 0 can be primary, shards k to k+m-1 (coding shards) can be primary
+ for (int i = 1; i < k; i++) {
+ pool.nonprimary_shards.insert(shard_id_t(i));
+ }
+ }
+
+ return pool;
+ }
+
+ // Build an in-memory replicated pg_pool_t.
+ // NOTE(review): pool_id is currently unused here as well — TODO confirm intent.
+ static pg_pool_t create_replicated_pool(
+ int size,
+ int min_size,
+ int64_t pool_id = 0)
+ {
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_REPLICATED;
+ pool.size = size;
+ pool.min_size = min_size;
+ pool.crush_rule = 0;
+
+ return pool;
+ }
+
+ // Place OSDs 0..k+m-1 as the acting set for an EC PG.
+ // The primary_shard argument is deliberately not applied (see comment below).
+ static void setup_ec_pg(
+ OSDMap& osdmap,
+ pg_t pgid,
+ int k,
+ int m,
+ int primary_shard = 0)
+ {
+ std::vector<int> acting;
+ for (int i = 0; i < k + m; i++) {
+ acting.push_back(i);
+ }
+ set_pg_acting(osdmap, pgid, acting);
+ // Don't set primary_temp for EC pools - let OSDMap determine the primary
+ // based on the pool's nonprimary_shards configuration
+ // set_pg_acting_primary(osdmap, pgid, primary_shard);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void setup_ec_pg(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ int k,
+ int m,
+ int primary_shard = 0)
+ {
+ setup_ec_pg(*osdmap, pgid, k, m, primary_shard);
+ }
+
+ // Copy the pool, unset the flag, then apply via incremental.
+ // Asserts that the pool exists; creates a new epoch.
+ static void clear_pool_flag(
+ OSDMap& osdmap,
+ int64_t pool_id,
+ uint64_t flag)
+ {
+ const pg_pool_t* existing = osdmap.get_pg_pool(pool_id);
+ ceph_assert(existing != nullptr);
+
+ pg_pool_t updated = *existing;
+ updated.unset_flag(flag);
+
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_pools[pool_id] = updated;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void clear_pool_flag(
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ uint64_t flag)
+ {
+ clear_pool_flag(*osdmap, pool_id, flag);
+ }
+
+ // OSD state manipulation methods
+
+ /**
+ * Mark an OSD as down (exists but not UP) in the OSDMap.
+ * Creates a new epoch.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_id The OSD to mark as down
+ */
+ static void mark_osd_down(OSDMap& osdmap, int osd_id)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osd_down(std::shared_ptr<OSDMap> osdmap, int osd_id)
+ {
+ mark_osd_down(*osdmap, osd_id);
+ }
+
+ /**
+ * Mark an OSD as up in the OSDMap.
+ * Creates a new epoch.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_id The OSD to mark as up
+ */
+ static void mark_osd_up(OSDMap& osdmap, int osd_id)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osd_up(std::shared_ptr<OSDMap> osdmap, int osd_id)
+ {
+ mark_osd_up(*osdmap, osd_id);
+ }
+
+ /**
+ * Mark multiple OSDs as down in the OSDMap.
+ * All state changes land in a single incremental, so only one new epoch
+ * is created regardless of how many OSDs are listed.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_ids The OSDs to mark as down
+ */
+ static void mark_osds_down(OSDMap& osdmap, const std::vector<int>& osd_ids)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ for (int osd_id : osd_ids) {
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+ }
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osds_down(std::shared_ptr<OSDMap> osdmap, const std::vector<int>& osd_ids)
+ {
+ mark_osds_down(*osdmap, osd_ids);
+ }
+
+ /**
+ * Advance to a new epoch without changing OSD states.
+ * Useful for testing re-peering scenarios.
+ *
+ * @param osdmap The OSDMap to modify
+ */
+ static void advance_epoch(OSDMap& osdmap)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void advance_epoch(std::shared_ptr<OSDMap> osdmap)
+ {
+ advance_epoch(*osdmap);
+ }
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "test/osd/PGBackendTestFixture.h"
+#include "common/errno.h"
+
+// Build the full EC test topology: an OSDMap with k+m up OSDs, an EC pool,
+// one PG whose acting set is OSDs 0..k+m-1, a per-shard collection in the
+// object store, one MockPGBackendListener + ECSwitch backend per shard, and
+// a message router wiring every shard's _handle_message to the others.
+// Uses the fixture fields k, m, stripe_unit, pool_flags, ec_plugin,
+// ec_technique and (optionally) listener_factory.
+void PGBackendTestFixture::setup_ec_pool()
+{
+ CephContext *cct = g_ceph_context;
+
+ osdmap = std::make_shared<OSDMap>();
+ osdmap->set_max_osd(k + m);
+
+ // Seed initial per-OSD state directly; the "up/in" transition below is
+ // then done through an incremental so derived fields get recomputed.
+ for (int i = 0; i < k + m; i++) {
+ osdmap->set_state(i, CEPH_OSD_EXISTS);
+ osdmap->set_weight(i, CEPH_OSD_OUT);
+ osdmap->crush->set_item_name(i, "osd." + std::to_string(i));
+ }
+
+ // Use incremental to set OSDs as up and with proper features
+ OSDMap::Incremental inc(osdmap->get_epoch() + 1);
+ inc.fsid = osdmap->get_fsid();
+
+ for (int i = 0; i < k + m; i++) {
+ inc.new_state[i] = CEPH_OSD_UP;
+ inc.new_weight[i] = CEPH_OSD_IN;
+
+ // Set up_thru to a high value to avoid WaitUpThru state during initial peering
+ // The OSDMap will go through several increments (adding pools, etc.) so we need
+ // up_thru to be higher than the final epoch
+ inc.new_up_thru[i] = 100;
+
+ // Set OSD features to include NAUTILUS, OCTOPUS and QUINCY server features (required for peering)
+ osd_xinfo_t xinfo;
+ xinfo.features = CEPH_FEATUREMASK_SERVER_NAUTILUS | CEPH_FEATUREMASK_SERVER_OCTOPUS | CEPH_FEATUREMASK_SERVER_QUINCY;
+ inc.new_xinfo[i] = xinfo;
+ }
+
+ // Apply the incremental to set state, weight, and features
+ // This will properly calculate up_osd_features
+ osdmap->apply_incremental(inc);
+
+ // Create the EC pool and pin its acting set to OSDs 0..k+m-1.
+ pg_pool_t pool = OSDMapTestHelpers::create_ec_pool(k, m, stripe_unit * k, pool_flags, pool_id);
+ OSDMapTestHelpers::add_pool(osdmap, pool_id, pool);
+
+ pgid = pg_t(0, pool_id);
+ spgid = spg_t(pgid, shard_id_t(0));
+
+ OSDMapTestHelpers::setup_ec_pg(osdmap, pgid, k, m, 0);
+
+ // Finalize the CRUSH map to calculate working_size
+ // This is required for crush_init_workspace() to work correctly
+ osdmap->crush->finalize();
+
+ // Select the erasure-code implementation: either the in-process mock, or
+ // a real plugin loaded via the plugin registry.
+ if (ec_plugin == "mock") {
+ ec_impl = std::make_shared<MockErasureCode>(k, k + m);
+ } else {
+ ErasureCodeProfile profile;
+ profile["k"] = std::to_string(k);
+ profile["m"] = std::to_string(m);
+ profile["plugin"] = ec_plugin;
+
+ if (!ec_technique.empty()) {
+ profile["technique"] = ec_technique;
+ }
+
+ profile["stripe_unit"] = std::to_string(stripe_unit);
+
+ std::stringstream ss;
+ // Tests are run from the build directory, so "./lib" points to the
+ // erasure code plugins in the build tree rather than /usr/local/lib64/ceph/erasure-code/
+ int ret = ceph::ErasureCodePluginRegistry::instance().factory(
+ ec_plugin,
+ "./lib",
+ profile,
+ &ec_impl,
+ &ss);
+
+ if (ret != 0) {
+ FAIL() << "Failed to create EC plugin '" << ec_plugin << "': " << ss.str();
+ return;
+ }
+ }
+
+ // One collection per shard; shard 0's handles double as the fixture-level
+ // ch/coll convenience members.
+ ObjectStore::Transaction t;
+ for (int i = 0; i < k + m; i++) {
+ spg_t shard_spgid(pgid, shard_id_t(i));
+ coll_t shard_coll(shard_spgid);
+ auto shard_ch = store->create_new_collection(shard_coll);
+ t.create_collection(shard_coll, 0);
+
+ colls[i] = shard_coll;
+ chs[i] = shard_ch;
+
+ if (i == 0) {
+ ch = shard_ch;
+ coll = shard_coll;
+ }
+ }
+
+ ASSERT_EQ(store->queue_transaction(ch, std::move(t)), 0);
+
+ const pg_pool_t* pool_ptr = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool_ptr != nullptr);
+
+ // Create one listener + ECSwitch backend per shard.
+ for (int i = 0; i < k + m; i++) {
+ std::unique_ptr<MockPGBackendListener> shard_listener;
+ if (listener_factory) {
+ shard_listener = listener_factory(
+ i,
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t(i)));
+ } else {
+ shard_listener = std::make_unique<MockPGBackendListener>(
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t(i))
+ );
+ }
+
+ // Initialize the listener's own info.pgid so OSDMap queries work
+ shard_listener->info.pgid = spg_t(pgid, shard_id_t(i));
+
+ for (int j = 0; j < k + m; j++) {
+ shard_listener->shardset.insert(pg_shard_t(j, shard_id_t(j)));
+ shard_listener->acting_recovery_backfill_shard_id_set.insert(shard_id_t(j));
+
+ // Initialize shard_info for each shard - required by EC backend
+ pg_info_t shard_pg_info;
+ shard_pg_info.pgid = spg_t(pgid, shard_id_t(j));
+ shard_listener->shard_info[pg_shard_t(j, shard_id_t(j))] = shard_pg_info;
+
+ // Initialize shard_missing for each shard - required by EC backend
+ pg_missing_t shard_missing;
+ shard_listener->shard_missing[pg_shard_t(j, shard_id_t(j))] = shard_missing;
+ }
+
+ shard_listener->set_store(store.get(), chs[i]);
+ shard_listener->set_event_loop(event_loop.get());
+ shard_listener->set_op_tracker(op_tracker.get());
+
+ auto shard_lru = std::make_unique<ECExtentCache::LRU>(1024 * 1024 * 100);
+ auto shard_ec_switch = std::make_unique<ECSwitch>(
+ shard_listener.get(), colls[i], chs[i], store.get(),
+ cct, ec_impl, stripe_unit * k, *shard_lru);
+
+ listeners[i] = std::move(shard_listener);
+ lrus[i] = std::move(shard_lru);
+ backends[i] = std::move(shard_ec_switch);
+ }
+
+ // Route messages addressed to shard i into backend i's _handle_message.
+ for (int i = 0; i < k + m; i++) {
+ message_router[i] = [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ };
+ }
+
+ for (int i = 0; i < k + m; i++) {
+ listeners[i]->set_message_router(&message_router);
+ listeners[i]->set_handle_message_callback(
+ [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ });
+ }
+}
+
+// Build the replicated test topology: an OSDMap with num_replicas OSDs, a
+// replicated pool, a single PG with OSD 0 pinned as primary, one shared
+// collection for all replicas, one MockPGBackendListener + ReplicatedBackend
+// per replica, and a message router between them.
+void PGBackendTestFixture::setup_replicated_pool()
+{
+ CephContext *cct = g_ceph_context;
+
+ osdmap = std::make_shared<OSDMap>();
+ osdmap->set_max_osd(num_replicas);
+ osdmap->set_state(0, CEPH_OSD_EXISTS | CEPH_OSD_UP);
+
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_REPLICATED;
+ pool.size = num_replicas;
+ pool.min_size = min_size;
+ pool.crush_rule = 0;
+
+ osdmap->inc_epoch();
+
+ OSDMapTestHelpers::add_pool(osdmap, pool_id, pool);
+
+ // Finalize the CRUSH map to calculate working_size
+ // This is required for crush_init_workspace() to work correctly
+ osdmap->crush->finalize();
+
+ pgid = pg_t(0, pool_id);
+ spgid = spg_t(pgid, shard_id_t::NO_SHARD);
+
+ // Set up pg_temp to define the acting set with OSD 0 as primary
+ std::vector<int> acting;
+ for (int i = 0; i < num_replicas; i++) {
+ acting.push_back(i);
+ }
+ OSDMapTestHelpers::set_pg_acting(osdmap, pgid, acting);
+ OSDMapTestHelpers::set_pg_acting_primary(osdmap, pgid, 0);
+
+ ObjectStore::Transaction t;
+ spg_t replica_spgid(pgid, shard_id_t::NO_SHARD);
+ coll_t replica_coll(replica_spgid);
+ auto replica_ch = store->create_new_collection(replica_coll);
+ t.create_collection(replica_coll, 0);
+
+ ASSERT_EQ(store->queue_transaction(replica_ch, std::move(t)), 0);
+
+ // All replicas share the same collection
+ for (int i = 0; i < num_replicas; i++) {
+ colls[i] = replica_coll;
+ chs[i] = replica_ch;
+ }
+
+ ch = replica_ch;
+ coll = replica_coll;
+
+ const pg_pool_t* pool_ptr = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool_ptr != nullptr);
+
+ // Create one listener + ReplicatedBackend per replica.
+ for (int i = 0; i < num_replicas; i++) {
+ std::unique_ptr<MockPGBackendListener> replica_listener;
+ if (listener_factory) {
+ replica_listener = listener_factory(
+ i,
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t::NO_SHARD));
+ } else {
+ replica_listener = std::make_unique<MockPGBackendListener>(
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t::NO_SHARD)
+ );
+ }
+
+ // Initialize the listener's own info.pgid so OSDMap queries work
+ replica_listener->info.pgid = spg_t(pgid, shard_id_t::NO_SHARD);
+
+ // For replicated pools, use NO_SHARD for all replicas
+ for (int j = 0; j < num_replicas; j++) {
+ replica_listener->shardset.insert(pg_shard_t(j, shard_id_t::NO_SHARD));
+
+ // Initialize shard_info for each replica - required by backend
+ pg_info_t replica_pg_info;
+ replica_pg_info.pgid = spg_t(pgid, shard_id_t::NO_SHARD);
+ replica_listener->shard_info[pg_shard_t(j, shard_id_t::NO_SHARD)] = replica_pg_info;
+
+ // Initialize shard_missing for each replica - required by backend
+ pg_missing_t replica_missing;
+ replica_listener->shard_missing[pg_shard_t(j, shard_id_t::NO_SHARD)] = replica_missing;
+ }
+
+ replica_listener->set_store(store.get(), chs[i]);
+ replica_listener->set_event_loop(event_loop.get());
+ replica_listener->set_op_tracker(op_tracker.get());
+
+ auto replica_backend = std::make_unique<ReplicatedBackend>(
+ replica_listener.get(), colls[i], chs[i], store.get(), cct);
+
+ listeners[i] = std::move(replica_listener);
+ backends[i] = std::move(replica_backend);
+ }
+
+ // Route messages addressed to replica i into backend i's _handle_message.
+ for (int i = 0; i < num_replicas; i++) {
+ message_router[i] = [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ };
+ }
+
+ for (int i = 0; i < num_replicas; i++) {
+ listeners[i]->set_message_router(&message_router);
+ listeners[i]->set_handle_message_callback(
+ [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ });
+ }
+}
+
+// Submit a PG transaction through the primary backend and run the event loop
+// until the completion callback fires. Returns the completion result code.
+// Throws std::runtime_error if the transaction does not complete within the
+// event-loop budget (run_until_idle(10000)).
+int PGBackendTestFixture::do_transaction_and_complete(
+ const hobject_t& hoid,
+ PGTransactionUPtr pg_t,
+ const object_stat_sum_t& delta_stats,
+ const eversion_t& at_version,
+ std::vector<pg_log_entry_t> log_entries)
+{
+ eversion_t trim_to(0, 0);
+ eversion_t pg_committed_to(0, 0);
+ std::optional<pg_hit_set_history_t> hset_history;
+
+ // Captured by reference: on_complete runs before this function returns
+ // (the event loop is drained synchronously below).
+ bool completed = false;
+ int completion_result = -1;
+ Context *on_complete = new LambdaContext([&completed, &completion_result](int r) {
+ completed = true;
+ completion_result = r;
+ });
+
+ ceph_tid_t tid = 1;
+ osd_reqid_t reqid(entity_name_t::OSD(0), 0, tid);
+
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ primary_backend->submit_transaction(
+ hoid,
+ delta_stats,
+ at_version,
+ std::move(pg_t),
+ trim_to,
+ pg_committed_to,
+ std::move(log_entries),
+ hset_history,
+ on_complete,
+ tid,
+ reqid,
+ OpRequestRef()
+ );
+
+ // Drive all queued backend/replica events to completion.
+ event_loop->run_until_idle(10000);
+
+ if (!completed) {
+ throw std::runtime_error("Transaction did not complete within timeout");
+ }
+
+ return completion_result;
+}
+
+// Create a new object and write 'data' at offset 0 in a single transaction.
+// On success (result 0) the object context is updated to reflect the new
+// size and version. Returns the transaction completion code.
+int PGBackendTestFixture::create_and_write(
+ const std::string& obj_name,
+ const std::string& data,
+ const eversion_t& at_version)
+{
+ hobject_t hoid = make_test_object(obj_name);
+ PGTransactionUPtr pg_t = std::make_unique<PGTransaction>();
+ pg_t->create(hoid);
+
+ // Fresh (non-existing) object context for the create path.
+ ObjectContextRef obc = make_object_context(hoid, false, 0);
+ pg_t->obc_map[hoid] = obc;
+
+ bufferlist bl;
+ bl.append(data);
+ pg_t->write(hoid, 0, bl.length(), bl);
+
+ object_stat_sum_t delta_stats;
+ delta_stats.num_objects = 1;
+ delta_stats.num_bytes = bl.length();
+
+ // Single MODIFY log entry; creates are not rollbackable here.
+ std::vector<pg_log_entry_t> log_entries;
+ pg_log_entry_t entry;
+ entry.mark_unrollbackable();
+ entry.op = pg_log_entry_t::MODIFY;
+ entry.soid = hoid;
+ entry.version = at_version;
+ entry.prior_version = eversion_t(0, 0);
+ log_entries.push_back(entry);
+
+ int result = do_transaction_and_complete(
+ hoid, std::move(pg_t), delta_stats, at_version, std::move(log_entries));
+
+ if (result == 0) {
+ obc->obs.exists = true;
+ obc->obs.oi.size = bl.length();
+ obc->obs.oi.version = at_version;
+ }
+
+ return result;
+}
+
+// Overwrite an existing object at 'offset' with 'data'. 'object_size' is the
+// object's size before this write; delta_stats only counts bytes that grow
+// the object. On success the object context is advanced to at_version and
+// the (possibly larger) new size. Returns the transaction completion code.
+int PGBackendTestFixture::write(
+ const std::string& obj_name,
+ uint64_t offset,
+ const std::string& data,
+ const eversion_t& prior_version,
+ const eversion_t& at_version,
+ uint64_t object_size)
+{
+ hobject_t hoid = make_test_object(obj_name);
+ PGTransactionUPtr pg_t = std::make_unique<PGTransaction>();
+
+ // Existing-object context seeded with the pre-write size/version.
+ ObjectContextRef obc = make_object_context(hoid, true, object_size);
+ obc->obs.oi.version = prior_version;
+ pg_t->obc_map[hoid] = obc;
+
+ bufferlist bl;
+ bl.append(data);
+ pg_t->write(hoid, offset, bl.length(), bl);
+
+ object_stat_sum_t delta_stats;
+ uint64_t new_size = std::max(object_size, offset + bl.length());
+ if (new_size > object_size) {
+ delta_stats.num_bytes = new_size - object_size;
+ } else {
+ delta_stats.num_bytes = 0;
+ }
+
+ std::vector<pg_log_entry_t> log_entries;
+ pg_log_entry_t entry;
+ // Don't mark as unrollbackable - partial writes need rollback support
+ entry.op = pg_log_entry_t::MODIFY;
+ entry.soid = hoid;
+ entry.version = at_version;
+ entry.prior_version = prior_version;
+ log_entries.push_back(entry);
+
+ int result = do_transaction_and_complete(
+ hoid, std::move(pg_t), delta_stats, at_version, std::move(log_entries));
+
+ if (result == 0) {
+ obc->obs.oi.size = new_size;
+ obc->obs.oi.version = at_version;
+ }
+
+ return result;
+}
+
+// Read [offset, offset+length) of an object into out_data.
+// EC pools go through the asynchronous ECSwitch read path (driven to
+// completion on the event loop); replicated pools use the synchronous
+// ReplicatedBackend path. Returns the per-extent read result code.
+// Throws std::runtime_error if the EC read does not complete in time.
+int PGBackendTestFixture::read_object(
+ const std::string& obj_name,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& out_data,
+ uint64_t object_size)
+{
+ hobject_t hoid = make_test_object(obj_name);
+
+ if (pool_type == EC) {
+ bool completed = false;
+ int completion_result = -1;
+
+ std::list<std::pair<ec_align_t, std::pair<bufferlist*, Context*>>> to_read;
+
+ ec_align_t align(offset, length, 0);
+
+ // Per-extent completion: records the result for the single extent we read.
+ Context *read_complete = new LambdaContext([&completed, &completion_result](int r) {
+ completed = true;
+ completion_result = r;
+ });
+
+ to_read.push_back(std::make_pair(align, std::make_pair(&out_data, read_complete)));
+
+ // Whole-operation completion: intentionally a no-op; we only report the
+ // per-extent result above.
+ Context *on_complete = new LambdaContext([](int r) {
+ });
+
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ ECSwitch* ec_switch = dynamic_cast<ECSwitch*>(primary_backend);
+ ceph_assert(ec_switch != nullptr);
+
+ ec_switch->objects_read_async(
+ hoid,
+ object_size,
+ to_read,
+ on_complete,
+ false
+ );
+
+ event_loop->run_until_idle(10000);
+
+ if (!completed) {
+ throw std::runtime_error("Read operation did not complete within timeout");
+ }
+
+ return completion_result;
+ } else {
+ // NOTE(review): object_size is unused on the replicated path — the sync
+ // read only needs offset/length. Confirm whether it should be validated.
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ ReplicatedBackend* rep_backend = dynamic_cast<ReplicatedBackend*>(primary_backend);
+ ceph_assert(rep_backend != nullptr);
+
+ int result = rep_backend->objects_read_sync(
+ hoid,
+ offset,
+ length,
+ 0,
+ &out_data
+ );
+
+ return result;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// NOTE: update_osdmap() intentionally does NOT reconcile listener acting sets
+//
+// This method updates only:
+// - The fixture's osdmap pointer
+// - The osdmap reference in all listeners
+//
+// It does NOT update the following fields on any MockPGBackendListener:
+// - shardset
+// - acting_recovery_backfill_shard_id_set
+// - shard_info
+// - shard_missing
+//
+// This is intentional: those fields describe the acting set as seen by each
+// individual OSD, and their correct values depend on the specific failure
+// scenario being simulated. Updating them blindly here would hide bugs and
+// make it impossible to test partial-failure cases.
+//
+// Callers that need to simulate an OSD failure MUST update those fields
+// themselves before (or after) calling update_osdmap().
+//
+// See TestECFailover::simulate_osd_failure() for a worked example that
+// removes the failed shard from shardset and
+// acting_recovery_backfill_shard_id_set on every listener before delegating
+// to update_osdmap().
+// ---------------------------------------------------------------------------
+// @param new_osdmap  the map to install on the fixture and all listeners
+// @param new_primary currently unused here — NOTE(review): confirm whether a
+//                    caller depends on it or it can be dropped from the API
+void PGBackendTestFixture::update_osdmap(
+ std::shared_ptr<OSDMap> new_osdmap,
+ std::optional<pg_shard_t> new_primary)
+{
+ // Step 1: Call on_change() on all backends to clear in-flight operations
+ for (auto& [instance, be] : backends) {
+ if (be) {
+ be->on_change();
+ }
+ }
+
+ // Step 2: Update the osdmap reference
+ osdmap = new_osdmap;
+
+ // Step 3: Update the osdmap in all listeners
+ for (auto& [instance, list] : listeners) {
+ if (list) {
+ list->osdmap = new_osdmap;
+ }
+ }
+}
+
+// Best-effort removal of the per-test MemStore data directory.
+// Safe to call multiple times (TearDown and the destructor both call it);
+// it is a no-op once the directory is gone.
+void PGBackendTestFixture::cleanup_data_dir()
+{
+ // Only clean up if the directory exists and hasn't been cleaned already
+ if (!data_dir.empty() && std::filesystem::exists(data_dir)) {
+ std::error_code ec;
+ // Non-throwing overload: errors are captured in 'ec' and ignored.
+ std::filesystem::remove_all(data_dir, ec);
+ // Silently ignore errors during cleanup - we tried our best
+ }
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <filesystem>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <iomanip>
+#include <gtest/gtest.h>
+#include "common/errno.h"
+#include "test/osd/MockErasureCode.h"
+#include "test/osd/MockPGBackendListener.h"
+#include "test/osd/EventLoop.h"
+#include "common/TrackedOp.h"
+#include "os/memstore/MemStore.h"
+#include "osd/ECSwitch.h"
+#include "osd/ECExtentCache.h"
+#include "osd/ReplicatedBackend.h"
+#include "osd/PGBackend.h"
+#include "osd/OSDMap.h"
+#include "osd/osd_types.h"
+#include "osd/PGTransaction.h"
+#include "common/ceph_context.h"
+#include "os/ObjectStore.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "test/osd/OSDMapTestHelpers.h"
+
+// Unified test fixture for EC and Replicated backend tests with ObjectStore.
+// Uses PoolType to branch between EC (ECSwitch) and Replicated (ReplicatedBackend).
+class PGBackendTestFixture : public ::testing::Test {
+public:
+ enum PoolType {
+ EC,
+ REPLICATED
+ };
+
+protected:
+ PoolType pool_type;
+
+ // Pool flags to set on the EC pool (e.g., FLAG_EC_OVERWRITES, FLAG_EC_OPTIMIZATIONS).
+ // Derived classes can set this before SetUp() to configure the pool flags.
+ // setup_ec_pool() uses this value when creating the pool.
+ // Default includes both OVERWRITES and OPTIMIZATIONS flags.
+ uint64_t pool_flags = pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS;
+
+ std::unique_ptr<MemStore> store;
+ std::string data_dir;
+ ObjectStore::CollectionHandle ch;
+ coll_t coll;
+
+ std::shared_ptr<OSDMap> osdmap;
+ std::unique_ptr<OpTracker> op_tracker;
+ std::unique_ptr<EventLoop> event_loop;
+ std::map<int, std::function<bool(OpRequestRef)>> message_router;
+
+ std::map<int, std::unique_ptr<MockPGBackendListener>> listeners;
+ std::map<int, std::unique_ptr<PGBackend>> backends;
+ std::map<int, coll_t> colls;
+ std::map<int, ObjectStore::CollectionHandle> chs;
+
+ /**
+ * Optional listener factory callback.
+ *
+ * If set, setup_ec_pool() and setup_replicated_pool() will call this
+ * factory instead of constructing MockPGBackendListener directly.
+ * The factory receives the instance index and the parameters needed to
+ * construct the listener, and must return a unique_ptr to the new
+ * MockPGBackendListener. The returned object is stored in listeners[i]
+ * as usual, so ownership stays with the base class.
+ *
+ * Derived classes (e.g. ECPeeringTestFixture) can set this in their
+ * constructor to gain direct access to the created listeners without
+ * needing to steal ownership via release_listener().
+ */
+ std::function<std::unique_ptr<MockPGBackendListener>(
+ int instance,
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ DoutPrefixProvider* dpp,
+ pg_shard_t whoami)> listener_factory;
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ std::map<int, std::unique_ptr<ECExtentCache::LRU>> lrus;
+ int k = 4; // data chunks
+ int m = 2; // coding chunks
+ uint64_t stripe_unit = 4096; // aka chunk_size
+ std::string ec_plugin = "isa";
+ std::string ec_technique = "reed_sol_van";
+
+ int num_replicas = 3;
+ int min_size = 2;
+
+ int64_t pool_id = 0;
+ pg_t pgid;
+ spg_t spgid;
+
+ class TestDpp : public NoDoutPrefix {
+ public:
+ TestDpp(CephContext *cct) : NoDoutPrefix(cct, ceph_subsys_osd) {}
+
+ std::ostream& gen_prefix(std::ostream& out) const override {
+ out << "PGBackendTest: ";
+ return out;
+ }
+ };
+ std::unique_ptr<TestDpp> dpp;
+
+public:
+ explicit PGBackendTestFixture(PoolType type = EC) : pool_type(type)
+ {
+ std::random_device rd;
+ std::mt19937_64 gen(rd());
+ std::uniform_int_distribution<uint64_t> dis;
+ uint64_t random_num = dis(gen);
+
+ std::ostringstream oss;
+ oss << "memstore_test_" << std::hex << std::setfill('0') << std::setw(16) << random_num;
+ data_dir = oss.str();
+
+ ceph_assert(stripe_unit % 4096 == 0);
+ ceph_assert(stripe_unit != 0);
+ }
+
+ ~PGBackendTestFixture() {
+ // Ensure cleanup happens even if TearDown() wasn't called or failed
+ cleanup_data_dir();
+ }
+
+ void SetUp() override {
+ int r = ::mkdir(data_dir.c_str(), 0777);
+ if (r < 0) {
+ r = -errno;
+ std::cerr << __func__ << ": unable to create " << data_dir << ": " << cpp_strerror(r) << std::endl;
+ }
+ ASSERT_EQ(0, r);
+
+ // Create MemStore - contexts are stolen by MockPGBackendListener, so we don't need manual_finisher
+ store.reset(new MemStore(g_ceph_context, data_dir));
+ ASSERT_TRUE(store);
+ ASSERT_EQ(0, store->mkfs());
+ ASSERT_EQ(0, store->mount());
+
+ g_conf().set_safe_to_start_threads();
+
+ CephContext *cct = g_ceph_context;
+ dpp = std::make_unique<TestDpp>(cct);
+ event_loop = std::make_unique<EventLoop>(false);
+ op_tracker = std::make_unique<OpTracker>(cct, false, 1);
+
+ if (pool_type == EC) {
+ setup_ec_pool();
+ } else {
+ setup_replicated_pool();
+ }
+ }
+
+ void TearDown() override {
+ // 0. Process any remaining events in the EventLoop.
+ // If the test passed, orphaned events indicate a bug - warn and skip draining
+ // so the test fails loudly. If the test already failed, drain silently to
+ // allow the rest of TearDown to complete without cascading errors.
+ if (event_loop) {
+ if (event_loop->has_events()) {
+ if (!HasFailure()) {
+ ADD_FAILURE() << "TearDown: " << event_loop->queued_event_count()
+ << " orphaned events remain after a passing test";
+ }
+ event_loop->run_until_idle(1000);
+ }
+ }
+
+ // 1. Clean up all backend instances (polymorphic cleanup)
+ // Note: We skip calling on_change() during teardown as it may access
+ // invalid state. The backends will be destroyed anyway.
+ backends.clear();
+
+ // 2. Clean up EC-specific resources
+ if (pool_type == EC) {
+ lrus.clear();
+ ec_impl.reset();
+ }
+
+ // 3. Clean up listeners
+ listeners.clear();
+
+ // 4. Reset op tracker (call on_shutdown first)
+ if (op_tracker) {
+ op_tracker->on_shutdown();
+ op_tracker.reset();
+ }
+
+ // 5. Reset all collection handles
+ chs.clear();
+ colls.clear();
+
+ if (ch) {
+ ch.reset();
+ }
+
+ // 6. Unmount and destroy the store
+ if (store) {
+ store->umount();
+ store.reset();
+ }
+
+ // 7. Clean up the test directory
+ cleanup_data_dir();
+ }
+
+private:
+ void setup_ec_pool();
+ void setup_replicated_pool();
+ void cleanup_data_dir();
+
+public:
+ // Look up this fixture's pool in the current osdmap; asserts it exists.
+ const pg_pool_t& get_pool() const {
+ const pg_pool_t* pool = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool != nullptr);
+ return *pool;
+ }
+
+ // Number of backend instances: k + m shards for EC, num_replicas otherwise.
+ int get_instance_count() const {
+ return pool_type == EC ? (k + m) : num_replicas;
+ }
+
+ // Number of EC data chunks (k); only meaningful for EC pools.
+ int get_data_chunk_count() const {
+ return k;
+ }
+
+ // Number of EC coding chunks (m); only meaningful for EC pools.
+ int get_coding_chunk_count() const {
+ return m;
+ }
+
+ // EC stripe width in bytes: stripe_unit * k.
+ uint64_t get_stripe_width() const {
+ return stripe_unit * k;
+ }
+
+ // Configured min_size for the pool.
+ int get_min_size() const {
+ return min_size;
+ }
+
+ // Scan all listeners and return the one that currently reports itself as
+ // primary via pgb_is_primary(), or nullptr when no listener does.
+ virtual MockPGBackendListener* get_primary_listener() {
+ MockPGBackendListener* found = nullptr;
+ for (auto it = listeners.begin(); it != listeners.end() && !found; ++it) {
+ auto& candidate = it->second;
+ if (candidate && candidate->pgb_is_primary()) {
+ found = candidate.get();
+ }
+ }
+ return found;
+ }
+
+ // Return the backend paired with the primary listener, or nullptr when no
+ // listener is primary or the primary instance has no registered backend.
+ virtual PGBackend* get_primary_backend() {
+ for (auto& entry : listeners) {
+ const auto& lst = entry.second;
+ if (!lst || !lst->pgb_is_primary()) {
+ continue;
+ }
+ auto backend_it = backends.find(entry.first);
+ if (backend_it == backends.end()) {
+ return nullptr;
+ }
+ return backend_it->second.get();
+ }
+ return nullptr;
+ }
+
+ // Build an hobject_t for a head object (CEPH_NOSNAP) named `name` in this
+ // fixture's pool; empty key and namespace, hash 0.
+ hobject_t make_test_object(const std::string& name) const {
+ return hobject_t(object_t(name), "", CEPH_NOSNAP, 0, pool_id, "");
+ }
+
+ // Construct a minimal ObjectContext for hoid with the given existence flag
+ // and size; no snapset context is attached (ssc stays null).
+ ObjectContextRef make_object_context(
+ const hobject_t& hoid,
+ bool exists = false,
+ uint64_t size = 0) const
+ {
+ auto ctx = std::make_shared<ObjectContext>();
+ ctx->ssc = nullptr;
+ ctx->obs.exists = exists;
+ ctx->obs.oi = object_info_t(hoid);
+ ctx->obs.oi.size = size;
+ return ctx;
+ }
+
+ int do_transaction_and_complete(
+ const hobject_t& hoid,
+ PGTransactionUPtr pg_t,
+ const object_stat_sum_t& delta_stats,
+ const eversion_t& at_version,
+ std::vector<pg_log_entry_t> log_entries);
+
+ virtual int create_and_write(
+ const std::string& obj_name,
+ const std::string& data,
+ const eversion_t& at_version = eversion_t(1, 1));
+
+public:
+
+ int write(
+ const std::string& obj_name,
+ uint64_t offset,
+ const std::string& data,
+ const eversion_t& prior_version,
+ const eversion_t& at_version,
+ uint64_t object_size);
+
+ int read_object(
+ const std::string& obj_name,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& out_data,
+ uint64_t object_size);
+
+ /**
+ * Update the OSDMap and trigger backend cleanup.
+ *
+ * Calls on_change() on all backends, then updates the osdmap reference in
+ * the fixture and all listeners. Optionally updates the primary field on
+ * every MockPGBackendListener and the convenience pointers (listener, backend).
+ *
+ * Does NOT update acting-set fields (shardset,
+ * acting_recovery_backfill_shard_id_set, shard_info, shard_missing) on any
+ * listener — those depend on the specific failure scenario being simulated
+ * and must be updated by the caller. See TestECFailover::simulate_osd_failure()
+ * for a worked example.
+ */
+ virtual void update_osdmap(
+ std::shared_ptr<OSDMap> new_osdmap,
+ std::optional<pg_shard_t> new_primary = std::nullopt);
+
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+ * TestBackendBasics - Unified parameterized test harness for EC and Replicated
+ * backend operations.
+ *
+ * Two fixture classes are defined, each parameterized over the full set of
+ * backend configurations:
+ *
+ * TestBackendBasics
+ * Parameterized over BackendWriteReadParam (BackendConfig × WriteReadParam).
+ * 13 backends × 8 data sizes = 104 instances per test body.
+ *
+ * WriteThenRead – write data, verify protocol messages, read back, verify
+ * data integrity.
+ * PartialWrite – create an object, perform a partial write at a non-zero
+ * offset, read back and verify all three regions.
+ * DirectRead – optimized-EC only: write one full stripe of patterned
+ * data, then issue a direct (per-shard) sync read to each
+ * data shard and verify its contents.
+ *
+ * TestECFailover
+ * Parameterized over BackendConfig (EC configs only, 12 instances).
+ * Failover is an EC-specific concept (shard-based primary election).
+ *
+ * BasicOSDMapUpdate – write, update OSDMap epoch, verify read still works.
+ * PrimaryFailover – write, fail OSD 0, verify new primary and degraded
+ * read with EC reconstruction.
+ */
+
+#include <gtest/gtest.h>
+#include "test/osd/PGBackendTestFixture.h"
+#include "test/osd/TestCommon.h"
+#include "messages/MOSDECSubOpWrite.h"
+
+using namespace std;
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics fixture
+// ---------------------------------------------------------------------------
+
+/**
+ * TestBackendBasics - single fixture parameterized over BackendWriteReadParam.
+ *
+ * The constructor reads the BackendConfig portion of the parameter and
+ * configures the base fixture fields before GTest runs SetUp(): for EC pools
+ * it sets pool_type, k, m, stripe_unit, ec_plugin, ec_technique and
+ * pool_flags; for replicated pools it sets pool_type, num_replicas and
+ * min_size.
+ */
+class TestBackendBasics : public PGBackendTestFixture,
+ public ::testing::WithParamInterface<BackendWriteReadParam> {
+public:
+ TestBackendBasics() : PGBackendTestFixture() {
+ const auto& config = GetParam().backend;
+ pool_type = config.pool_type;
+ if (pool_type == EC) {
+ k = config.k;
+ m = config.m;
+ stripe_unit = config.stripe_unit;
+ ec_plugin = config.ec_plugin;
+ ec_technique = config.ec_technique;
+ pool_flags = config.pool_flags;
+ } else {
+ // Replicated pools ignore the EC fields of the config; every
+ // replicated instance uses a fixed 3-replica, min_size=2 layout.
+ num_replicas = 3;
+ min_size = 2;
+ }
+ }
+
+ // No SetUp() override: the previous override only forwarded to
+ // PGBackendTestFixture::SetUp(), which GTest invokes directly anyway.
+};
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: WriteThenRead
+// ---------------------------------------------------------------------------
+
+/**
+ * WriteThenRead - write data of the parameterized size, verify protocol
+ * messages were sent, read back, and verify data integrity.
+ *
+ * For EC backends: asserts that MSG_OSD_EC_WRITE messages were sent and that
+ * read messages are sent to shards.
+ * For Replicated backends: asserts that at least one message was sent.
+ */
+TEST_P(TestBackendBasics, WriteThenRead) {
+ const auto& size_param = GetParam().write_read;
+ const auto& cfg = GetParam().backend;
+ const bool is_ec = (cfg.pool_type == EC);
+
+ const std::string payload(size_param.size, size_param.fill);
+ const std::string obj_name = "test_backend_" + cfg.label + "_" + size_param.label;
+
+ // Execute create+write operation
+ int write_rc = create_and_write(obj_name, payload);
+ EXPECT_EQ(write_rc, 0) << size_param.label << " write should complete successfully";
+
+ // Verify messages were sent to replicas/shards
+ auto* primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ ASSERT_GT(primary->sent_messages.size(), 0u)
+ << "Should send messages to replicas/shards";
+
+ // For EC backends: verify EC write messages were sent
+ if (is_ec) {
+ int ec_writes = 0;
+ for (const auto& msg : primary->sent_messages) {
+ if (msg->get_type() == MSG_OSD_EC_WRITE) {
+ ec_writes++;
+ }
+ }
+ ASSERT_GT(ec_writes, 0) << "Should send EC write messages";
+ }
+
+ // Clear sent messages before read to distinguish read messages
+ primary->sent_messages.clear();
+ primary->sent_messages_with_dest.clear();
+
+ // Perform the read operation over the full payload.
+ bufferlist rbl;
+ int read_rc = read_object(obj_name, 0, payload.length(), rbl, payload.length());
+ EXPECT_GE(read_rc, 0) << size_param.label << " read should complete successfully";
+
+ // Verify data length
+ ASSERT_EQ(rbl.length(), payload.length())
+ << size_param.label << " read data length should match written data length";
+
+ // Verify data content
+ std::string round_trip(rbl.c_str(), rbl.length());
+ EXPECT_EQ(round_trip, payload)
+ << size_param.label << " read data should match written data";
+
+ // For EC backends: verify read messages were sent to shards
+ if (is_ec) {
+ primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ ASSERT_GT(primary->sent_messages.size(), 0u)
+ << "Should send read messages to EC shards";
+ }
+
+ // All events should be processed by now
+ ASSERT_FALSE(event_loop->has_events()) << "Event loop should be idle after read";
+
+ primary = get_primary_listener();
+ if (primary) {
+ primary->sent_messages.clear();
+ }
+}
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: PartialWrite
+// ---------------------------------------------------------------------------
+
+/**
+ * PartialWrite - create an object of size max(parameterized size, 12 KB),
+ * write a partial region at a non-zero offset, read back and verify that:
+ * - the region before the partial write is unchanged,
+ * - the partial-write region contains the new data,
+ * - the region after the partial write is unchanged.
+ *
+ * NOTE(review): the object size is NOT rounded to the EC stripe width; the
+ * backend is assumed to handle unaligned sizes — confirm this is intended
+ * for all EC configs.
+ */
+TEST_P(TestBackendBasics, PartialWrite) {
+ const auto& param = GetParam().write_read;
+ const auto& backend_config = GetParam().backend;
+
+ std::string obj_name = "test_partial_" + backend_config.label + "_" + param.label;
+
+ // Use the parameterized size as the initial object size, but ensure it is
+ // large enough to accommodate a non-trivial partial write. We need at least
+ // 3 regions: prefix, modified, suffix. Use max(param.size, 3 * 4096) so
+ // that even the smallest size parameters produce a meaningful test.
+ const size_t initial_size = std::max(param.size, size_t(3 * 4096));
+
+ // Partial write covers the middle third of the object (aligned to 4 KB).
+ // `region` can be 0 only if initial_size < 3 * 4096, which the max() above
+ // prevents; the 4096 fallbacks below are defensive.
+ const size_t region = (initial_size / 3) & ~size_t(4095); // round down to 4 KB
+ const size_t partial_offset = region ? region : 4096;
+ const size_t partial_size = region ? region : 4096;
+
+ // Create initial data filled with the parameterized fill character
+ std::string initial_data(initial_size, param.fill);
+
+ int result = create_and_write(obj_name, initial_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << param.label << " initial write should complete successfully";
+
+ // Partial write data uses the next fill character (wraps around 'z' -> 'a')
+ char partial_fill = (param.fill == 'z') ? 'a' : (param.fill + 1);
+ std::string partial_data(partial_size, partial_fill);
+
+ result = write(
+ obj_name,
+ partial_offset,
+ partial_data,
+ eversion_t(1, 1), // prior_version
+ eversion_t(1, 2), // at_version
+ initial_size // object_size
+ );
+ EXPECT_EQ(result, 0) << param.label << " partial write should complete successfully";
+
+ // Read back the entire object
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, initial_size, read_data, initial_size);
+ EXPECT_GE(read_result, 0)
+ << param.label << " read after partial write should complete successfully";
+
+ ASSERT_EQ(read_data.length(), initial_size)
+ << param.label << " read data length should match object size";
+
+ const char* buf = read_data.c_str();
+
+ // Region before the partial write should be unchanged
+ for (size_t i = 0; i < partial_offset; i++) {
+ ASSERT_EQ(buf[i], param.fill)
+ << param.label << " data before partial write offset should be unchanged at position " << i;
+ }
+
+ // Partial-write region should contain the new fill character
+ for (size_t i = partial_offset; i < partial_offset + partial_size; i++) {
+ ASSERT_EQ(buf[i], partial_fill)
+ << param.label << " data at partial write region should be '" << partial_fill
+ << "' at position " << i;
+ }
+
+ // Region after the partial write should be unchanged
+ for (size_t i = partial_offset + partial_size; i < initial_size; i++) {
+ ASSERT_EQ(buf[i], param.fill)
+ << param.label << " data after partial write region should be unchanged at position " << i;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: DirectRead
+// ---------------------------------------------------------------------------
+
+/**
+ * DirectRead - test EC direct reads to individual shards.
+ *
+ * This test:
+ * 1. Skips non-optimized EC (we don't support sync reads there)
+ * 2. Writes patterned data covering an entire stripe
+ * 3. Performs sync reads to each data shard with EC_DIRECT_READ flag
+ * 4. Verifies data integrity for each shard
+ *
+ * NOTE(review): the pattern and its verification assume data is striped
+ * stripe_unit-at-a-time across data shards 0..k-1 in order — confirm this
+ * matches the EC placement for all tested plugins.
+ */
+TEST_P(TestBackendBasics, DirectRead) {
+ const auto& param = GetParam().write_read;
+ const auto& backend_config = GetParam().backend;
+
+ // Skip test for non-EC backends
+ if (backend_config.pool_type != EC) {
+ GTEST_SKIP() << "DirectRead test only applies to EC backends";
+ }
+
+ // Skip test for non-optimized EC - we don't support sync reads
+ if (!(backend_config.pool_flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS)) {
+ GTEST_SKIP() << "DirectRead test requires optimized EC";
+ }
+
+ std::string obj_name = "test_direct_read_" + backend_config.label + "_" + param.label;
+
+ // Get stripe width from the pool
+ uint64_t stripe_width = get_stripe_width();
+
+ // Create patterned data where each stripe_unit has a distinct pattern
+ // This allows us to verify we're reading the correct shard
+ std::string test_data;
+ test_data.reserve(stripe_width);
+
+ for (size_t i = 0; i < stripe_width; i++) {
+ // Pattern: each stripe_unit gets a different character based on its shard position
+ size_t shard_index = i / stripe_unit;
+ char fill_char = 'A' + (shard_index % 26);
+ test_data.push_back(fill_char);
+ }
+
+ // Write the data (one full stripe)
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << param.label << " write should complete successfully";
+
+ hobject_t hoid = make_test_object(obj_name);
+
+ // Perform direct reads to each data shard (skip coding shards)
+ for (auto& [shard_id, backend] : backends) {
+ // Skip coding shards - only test data shards
+ if (shard_id >= k) {
+ continue;
+ }
+
+ ASSERT_TRUE(backend != nullptr) << "Backend for shard " << shard_id << " should not be null";
+
+ ECSwitch* ec_switch = dynamic_cast<ECSwitch*>(backend.get());
+ ASSERT_TRUE(ec_switch != nullptr) << "Backend should be ECSwitch for EC pools";
+
+ bufferlist shard_data;
+
+ // Perform sync read with EC_DIRECT_READ flag
+ // Read the entire stripe - we expect only this shard's data back
+ int read_result = ec_switch->objects_read_sync(
+ hoid,
+ 0, // offset
+ stripe_width, // length (full stripe)
+ CEPH_OSD_RMW_FLAG_EC_DIRECT_READ, // op_flags with direct read flag
+ &shard_data
+ );
+
+ EXPECT_GE(read_result, 0)
+ << param.label << " direct read to shard " << shard_id << " should complete successfully";
+
+ // For direct reads, we expect to get back only the data for this shard
+ // which is one stripe_unit
+ ASSERT_EQ(shard_data.length(), stripe_unit)
+ << param.label << " shard " << shard_id << " should return " << stripe_unit << " bytes";
+
+ // Verify data integrity: this shard should contain the expected pattern
+ const char* buf = shard_data.c_str();
+ char expected_char = 'A' + (shard_id % 26);
+
+ for (size_t i = 0; i < stripe_unit; i++) {
+ ASSERT_EQ(buf[i], expected_char)
+ << param.label << " shard " << shard_id << " byte " << i
+ << " should be '" << expected_char << "'";
+ }
+ }
+
+ // Clean up
+ auto* primary_listener = get_primary_listener();
+ if (primary_listener) {
+ primary_listener->sent_messages.clear();
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Backend configurations and size parameters
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Field order matches BackendConfig: pool_type, ec_plugin, ec_technique,
+// pool_flags, stripe_unit, k, m, label. Labels feed the GTest name generator.
+const std::vector<BackendConfig> kBackendConfigs = {
+ {PGBackendTestFixture::REPLICATED, "", "", 0, 4096, 4, 2, "Replicated"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 4, 2, "EC_ISA_Opt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 8192, 4, 2, "EC_ISA_Opt_k4m2_su8k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 16384, 4, 2, "EC_ISA_Opt_k4m2_su16k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 2, 1, "EC_ISA_Opt_k2m1_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 8, 3, "EC_ISA_Opt_k8m3_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES, 4096, 4, 2, "EC_ISA_NonOpt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 4, 2, "EC_Jerasure_Opt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 8192, 4, 2, "EC_Jerasure_Opt_k4m2_su8k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 16384, 4, 2, "EC_Jerasure_Opt_k4m2_su16k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 2, 1, "EC_Jerasure_Opt_k2m1_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 8, 3, "EC_Jerasure_Opt_k8m3_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES, 4096, 4, 2, "EC_Jerasure_NonOpt_k4m2_su4k"},
+};
+
+// {payload size in bytes, fill character, label used in test names}.
+const std::vector<WriteReadParam> kSizeParams = {
+ {4 * 1024, 'A', "4k"},
+ {8 * 1024, 'B', "8k"},
+ {12 * 1024, 'C', "12k"},
+ {12 * 1024 + 512, 'D', "12_5k"},
+ {16 * 1024, 'E', "16k"},
+ {31 * 1024 + 512, 'F', "31_5k"},
+ {32 * 1024, 'G', "32k"},
+ {32 * 1024 + 512, 'H', "32_5k"},
+};
+
+/**
+ * Build the cross-product of kBackendConfigs × kSizeParams.
+ */
+std::vector<BackendWriteReadParam> make_cross_product() {
+ std::vector<BackendWriteReadParam> result;
+ result.reserve(kBackendConfigs.size() * kSizeParams.size());
+ for (const auto& backend : kBackendConfigs) {
+ for (const auto& size : kSizeParams) {
+ result.push_back({backend, size});
+ }
+ }
+ return result;
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Instantiate TestBackendBasics with the full cross-product
+// ---------------------------------------------------------------------------
+
+// The name generator concatenates backend and size labels; GTest requires
+// the result to contain only alphanumerics and underscores, which all labels
+// above satisfy.
+INSTANTIATE_TEST_SUITE_P(
+ BackendSizes,
+ TestBackendBasics,
+ ::testing::ValuesIn(make_cross_product()),
+ [](const ::testing::TestParamInfo<BackendWriteReadParam>& info) {
+ return info.param.backend.label + "_" + info.param.write_read.label;
+ }
+);
+
+// ---------------------------------------------------------------------------
+// TestECFailover fixture and tests
+// ---------------------------------------------------------------------------
+
+/**
+ * TestECFailover - tests OSDMap updates and primary failover, parameterized
+ * over all EC backend configurations.
+ *
+ * Failover is an EC-specific concept (shard-based primary election), so only
+ * EC configs are included. The fixture reads k/m/stripe_unit/plugin/technique
+ * from the BackendConfig parameter so that every EC variant is exercised.
+ */
+class TestECFailover : public PGBackendTestFixture,
+ public ::testing::WithParamInterface<BackendConfig> {
+public:
+ TestECFailover() : PGBackendTestFixture(PGBackendTestFixture::EC) {
+ const auto& config = GetParam();
+ k = config.k;
+ m = config.m;
+ stripe_unit = config.stripe_unit;
+ ec_plugin = config.ec_plugin;
+ ec_technique = config.ec_technique;
+ pool_flags = config.pool_flags;
+ }
+
+ // Simulate the failure of one OSD/shard: mark it down in a copy of the
+ // osdmap, install a pg_temp acting set without it, strip it from every
+ // listener's acting sets, and push the new map through update_osdmap().
+ //
+ // The second parameter is intentionally unnamed: callers pass the instance
+ // they expect to become primary for call-site readability, but the actual
+ // primary is derived from the OSDMap by update_osdmap(). Naming it unused
+ // would trigger -Wunused-parameter.
+ void simulate_osd_failure(int failed_osd, int /*new_primary_instance*/)
+ {
+ auto new_osdmap = std::make_shared<OSDMap>();
+ new_osdmap->deepish_copy_from(*osdmap);
+
+ // Build new acting set with the failed OSD replaced by CRUSH_ITEM_NONE
+ std::vector<int> new_acting;
+ for (int i = 0; i < k+m; i++) {
+ new_acting.push_back((i == failed_osd) ? CRUSH_ITEM_NONE : i);
+ }
+
+ // Get the pool to use pgtemp_primaryfirst transformation
+ const pg_pool_t* pool = new_osdmap->get_pg_pool(pgid.pool());
+ ceph_assert(pool != nullptr);
+
+ // For EC pools with optimizations, pgtemp_primaryfirst reorders the acting set
+ // to put primary-eligible shards first. We need to apply this transformation
+ // before setting pg_temp so that the OSDMap will correctly identify the primary.
+ std::vector<int> transformed_acting = new_osdmap->pgtemp_primaryfirst(*pool, new_acting);
+
+ // Use OSDMap::Incremental to set pg_temp with the transformed acting set
+ OSDMap::Incremental inc(new_osdmap->get_epoch() + 1);
+ inc.fsid = new_osdmap->get_fsid();
+ inc.new_state[failed_osd] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+
+ // Convert to mempool vector for pg_temp
+ mempool::osdmap::vector<int> pg_temp_vec(transformed_acting.begin(), transformed_acting.end());
+ inc.new_pg_temp[pgid] = pg_temp_vec;
+
+ new_osdmap->apply_incremental(inc);
+
+ // Finalize the CRUSH map to ensure working_size is calculated
+ new_osdmap->crush->finalize();
+
+ // This fixture maps instance i to both osd.i and shard i (see new_acting
+ // above), so the failed shard id equals the failed osd id.
+ pg_shard_t failed_shard(failed_osd, shard_id_t(failed_osd));
+ for (auto& [instance_id, list] : listeners) {
+ list->shardset.erase(failed_shard);
+ list->acting_recovery_backfill_shard_id_set.erase(shard_id_t(failed_osd));
+ }
+
+ // update_osdmap will query the OSDMap to determine the primary
+ update_osdmap(new_osdmap);
+ }
+};
+
+// Write an object, bump the OSDMap epoch without changing membership, and
+// verify the map propagates to the fixture and listeners while the object
+// remains readable and intact.
+TEST_P(TestECFailover, BasicOSDMapUpdate) {
+ const std::string obj_name = "test_failover_object";
+ const std::string test_data = "Initial data before OSDMap change";
+
+ EXPECT_EQ(create_and_write(obj_name, test_data), 0)
+ << "Initial write should complete successfully";
+
+ bufferlist before_bl;
+ EXPECT_GE(read_object(obj_name, 0, test_data.length(), before_bl, test_data.length()), 0)
+ << "Read should complete successfully";
+ ASSERT_EQ(before_bl.length(), test_data.length());
+
+ // Install a copy of the current map with an incremented epoch.
+ auto bumped_map = std::make_shared<OSDMap>();
+ bumped_map->deepish_copy_from(*osdmap);
+ bumped_map->inc_epoch();
+ update_osdmap(bumped_map);
+
+ EXPECT_EQ(osdmap, bumped_map) << "OSDMap should be updated";
+ auto* primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ EXPECT_EQ(primary->osdmap, bumped_map) << "Listener OSDMap should be updated";
+
+ bufferlist after_bl;
+ EXPECT_GE(read_object(obj_name, 0, test_data.length(), after_bl, test_data.length()), 0)
+ << "Read after OSDMap update should complete successfully";
+ ASSERT_EQ(after_bl.length(), test_data.length());
+
+ std::string round_trip(after_bl.c_str(), after_bl.length());
+ EXPECT_EQ(round_trip, test_data) << "Data should match after OSDMap update";
+}
+
+// Write an object, fail OSD 0, and verify that a new primary is elected and
+// that a degraded read reconstructs the data. Fix over the original: the
+// final null-check on the new primary listener is now a fatal ASSERT placed
+// before the dereference (EXPECT is non-fatal, so the old code could
+// dereference nullptr), and the epoch comparison uses an unsigned literal.
+TEST_P(TestECFailover, PrimaryFailover) {
+ const std::string obj_name = "test_primary_failover";
+ const std::string test_data = "Data written before primary failover";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete successfully";
+
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(), read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully";
+ ASSERT_EQ(read_data.length(), test_data.length());
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data) << "Data should match before failover";
+
+ EXPECT_TRUE(listeners[0]->pgb_is_primary())
+ << "Instance 0 should be primary before failover";
+ EXPECT_FALSE(listeners[k]->pgb_is_primary())
+ << "Instance " << k << " should not be primary before failover";
+
+ // Determine expected new primary based on pool optimization
+ // For optimized EC: shards 1 to k-1 are nonprimary, so new primary will be shard k
+ // For non-optimized EC: any shard can be primary, so new primary will be shard 1
+ const pg_pool_t& pool = get_pool();
+ bool is_optimized = pool.has_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ int expected_new_primary = is_optimized ? k : 1;
+
+ simulate_osd_failure(0, expected_new_primary);
+
+ EXPECT_FALSE(listeners[0]->pgb_is_primary())
+ << "Instance 0 should not be primary after failover";
+ EXPECT_TRUE(listeners[expected_new_primary]->pgb_is_primary())
+ << "Instance " << expected_new_primary << " should be primary after failover";
+
+ // Verify the query functions return the correct primary. The listener must
+ // exist before we dereference it below, so this is a fatal assertion.
+ auto* new_primary_listener = get_primary_listener();
+ ASSERT_TRUE(new_primary_listener != nullptr) << "Primary listener should exist after failover";
+ auto* new_primary_backend = get_primary_backend();
+ EXPECT_EQ(new_primary_listener, listeners[expected_new_primary].get())
+ << "get_primary_listener() should return the new primary";
+ EXPECT_EQ(new_primary_backend, backends[expected_new_primary].get())
+ << "get_primary_backend() should return the new primary";
+
+ bufferlist read_data_after;
+ int read_result_after = read_object(obj_name, 0, test_data.length(), read_data_after, test_data.length());
+ EXPECT_GE(read_result_after, 0) << "Degraded read should complete successfully after failover";
+ ASSERT_EQ(read_data_after.length(), test_data.length());
+
+ std::string read_string_after(read_data_after.c_str(), read_data_after.length());
+ EXPECT_EQ(read_string_after, test_data) << "Data should match after failover with EC reconstruction";
+
+ // 1u: epochs are unsigned; avoids a signed/unsigned comparison warning.
+ EXPECT_GT(new_primary_listener->osdmap->get_epoch(), 1u)
+ << "OSDMap epoch should have incremented after failover";
+}
+
+// ---------------------------------------------------------------------------
+// Instantiate TestECFailover with EC-only backend configurations
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Filter kBackendConfigs down to the EC entries; replicated configs are
+// excluded because failover is an EC-only concept.
+std::vector<BackendConfig> make_ec_configs() {
+ std::vector<BackendConfig> result;
+ for (size_t i = 0; i < kBackendConfigs.size(); ++i) {
+ const auto& candidate = kBackendConfigs[i];
+ if (candidate.pool_type != PGBackendTestFixture::EC) {
+ continue;
+ }
+ result.push_back(candidate);
+ }
+ return result;
+}
+
+} // namespace
+
+// Instance names come straight from BackendConfig::label.
+INSTANTIATE_TEST_SUITE_P(
+ ECBackends,
+ TestECFailover,
+ ::testing::ValuesIn(make_ec_configs()),
+ [](const ::testing::TestParamInfo<BackendConfig>& info) {
+ return info.param.label;
+ }
+);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "test/osd/PGBackendTestFixture.h"
+
+/**
+ * WriteReadParam - parameter structure for write-then-read parameterized tests.
+ *
+ * Shared between test files to avoid ODR violations if both translation units
+ * are ever linked together, and to eliminate code duplication.
+ */
+struct WriteReadParam {
+ size_t size; // payload size in bytes
+ char fill; // character used to fill the payload
+ std::string label; // short name used in generated test names
+};
+
+/**
+ * BackendConfig - parameterizes the backend type for unified tests.
+ *
+ * Each configuration defines a pool type (EC or REPLICATED) plus
+ * EC-specific settings. The test fixture uses this to configure
+ * PGBackendTestFixture before SetUp(). Aggregate-initialized in field
+ * order: pool_type, ec_plugin, ec_technique, pool_flags, stripe_unit, k,
+ * m, label.
+ */
+struct BackendConfig {
+ PGBackendTestFixture::PoolType pool_type;
+ // EC-specific (ignored for REPLICATED)
+ std::string ec_plugin; // e.g. "isa", "jerasure", "mock"
+ std::string ec_technique; // e.g. "reed_sol_van"
+ // Pool flags (e.g., FLAG_EC_OVERWRITES | FLAG_EC_OPTIMIZATIONS).
+ // Defaulted to 0 so a default-constructed config never carries
+ // indeterminate flags (the sibling fields below already have defaults).
+ uint64_t pool_flags = 0;
+ uint64_t stripe_unit = 4096; // aka chunk_size; stripe_width = stripe_unit * k
+ int k = 4; // data chunks (EC only)
+ int m = 2; // coding chunks (EC only)
+ // Label for test naming
+ std::string label;
+};
+
+/**
+ * BackendWriteReadParam - combined parameter for backend + write/read size tests.
+ *
+ * Used for two-level parameterization: backend configuration × data sizes.
+ */
+struct BackendWriteReadParam {
+ BackendConfig backend; // which backend/pool layout to build
+ WriteReadParam write_read; // which payload size/fill to exercise
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include "test/osd/ECPeeringTestFixture.h"
+
+using namespace std;
+
+// Peering-enabled EC failover tests, pinned to a single 4+2 ISA
+// reed_sol_van configuration with a 4 KB stripe unit.
+// NOTE(review): pool_flags is left at the fixture default here — confirm
+// whether optimized-EC peering should also be covered.
+class TestECFailoverWithPeering : public ECPeeringTestFixture {
+public:
+ TestECFailoverWithPeering() : ECPeeringTestFixture() {
+ k = 4;
+ m = 2;
+ stripe_unit = 4096;
+ ec_plugin = "isa";
+ ec_technique = "reed_sol_van";
+ }
+};
+
+// Run a full peering cycle and verify all shards activate, the primary goes
+// clean, and exactly shard 0 reports itself as primary.
+TEST_F(TestECFailoverWithPeering, BasicPeeringCycle) {
+ run_peering_cycle();
+
+ EXPECT_TRUE(all_shards_active()) << "All shards should be active after peering";
+
+ // Note: In EC pools, only the primary tracks PG_STATE_CLEAN.
+ // Replicas are in ReplicaActive state and don't set the CLEAN flag.
+ // Get acting_primary from OSDMap
+ pg_t pgid = get_peering_state(0)->get_info().pgid.pgid;
+ std::vector<int> acting_osds;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting_osds, &acting_primary);
+
+ // Guard: acting_primary stays at the -1 sentinel when the map yields no
+ // primary; passing -1 to get_peering_state() below would be invalid.
+ ASSERT_GE(acting_primary, 0) << "OSDMap should map a primary for the PG";
+
+ EXPECT_TRUE(get_peering_state(acting_primary)->is_clean())
+ << "Primary should be clean after peering";
+
+ // Verify primary is shard 0
+ EXPECT_TRUE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Shard 0 should be primary";
+
+ for (int i = 1; i < k + m; i++) {
+ EXPECT_FALSE(get_peering_listener(i)->backend_listener->pgb_is_primary())
+ << "Shard " << i << " should not be primary";
+ }
+}
+
+// After a successful peering cycle, write an object, read it back intact,
+// and verify the write is recorded in the primary's PG log.
+TEST_F(TestECFailoverWithPeering, WriteWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Peering must complete before write";
+
+ const std::string obj_name = "test_write_with_peering";
+ const std::string test_data = "Data written with full peering support";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Write should complete successfully";
+
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(), read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully";
+ ASSERT_EQ(read_data.length(), test_data.length());
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data) << "Data should match";
+
+ // Fatal check before dereferencing the peering state below.
+ auto* primary_ps = get_peering_state(0);
+ ASSERT_TRUE(primary_ps != nullptr) << "Primary peering state should exist";
+ // 0u: log.size() is unsigned; avoids a signed/unsigned comparison warning.
+ EXPECT_GT(primary_ps->get_pg_log().get_log().log.size(), 0u)
+ << "Primary should have log entries after write";
+}
+
+// OSDFailureWithPeering: write a 16KB object, fail a non-primary data shard
+// (shard 1), then re-read an 8KB prefix. Verifies that:
+//   * the primary stays Peering/Active and the failed OSD is dropped from
+//     the acting set,
+//   * the read still returns the correct bytes (EC reconstruction), and
+//   * reconstruction costs extra shard reads, measured by comparing the
+//     number of backend messages sent before vs. after the failure.
+TEST_F(TestECFailoverWithPeering, OSDFailureWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_osd_failure";
+ // Write 16KB but read only 8KB to force reconstruction when shard 1 is down
+ const std::string test_data(16384, 'X'); // 16KB write
+ const size_t read_length = 8192; // 8KB read
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ // Pre-failover read: measure baseline message count with all OSDs up
+ // Clear message counters first
+ for (auto& [shard, listener] : backend_listeners) {
+ listener->sent_messages.clear();
+ }
+
+ bufferlist pre_failover_read;
+ int pre_read_result = read_object(obj_name, 0, read_length,
+ pre_failover_read, test_data.length());
+ EXPECT_GE(pre_read_result, 0) << "Pre-failover read should complete";
+
+ // Count messages sent during pre-failover read
+ size_t pre_failover_msg_count = 0;
+ for (auto& [shard, listener] : backend_listeners) {
+ pre_failover_msg_count += listener->sent_messages.size();
+ }
+
+ int failed_osd = 1; // Fail shard 1 which contains part of the data
+
+ // Use fixture helper to mark OSD as down
+ mark_osd_down(failed_osd);
+
+ // Primary (OSD 0) should remain active after non-primary OSD failure
+ auto* primary_ps = get_peering_state(0);
+ std::string primary_state = get_state_name(0);
+ EXPECT_TRUE(primary_state.find("Peering") != std::string::npos ||
+ primary_state.find("Active") != std::string::npos)
+ << "Primary should be peering or active after OSD failure, got: " << primary_state;
+
+ // The fixture maps osd id N to shard N, hence shard_id_t(failed_osd).
+ EXPECT_TRUE(primary_ps->get_acting_recovery_backfill().count(pg_shard_t(failed_osd, shard_id_t(failed_osd))) == 0)
+ << "Failed OSD should not be in acting set";
+
+ // Clear message counters before post-failover read
+ for (auto& [shard, listener] : backend_listeners) {
+ listener->sent_messages.clear();
+ }
+
+ // Post-failover read: verify EC reconstruction works with one OSD down
+ bufferlist post_failover_read;
+ int post_read_result = read_object(obj_name, 0, read_length,
+ post_failover_read, test_data.length());
+ EXPECT_GE(post_read_result, 0) << "Read should complete successfully after OSD failure";
+ ASSERT_EQ(post_failover_read.length(), read_length)
+ << "Read length should match after OSD failure";
+
+ std::string read_string(post_failover_read.c_str(), post_failover_read.length());
+ std::string expected_data(read_length, 'X');
+ EXPECT_EQ(read_string, expected_data)
+ << "Data should be correctly reconstructed via EC after OSD failure";
+
+ // Count messages sent during post-failover read
+ size_t post_failover_msg_count = 0;
+ for (auto& [shard, listener] : backend_listeners) {
+ post_failover_msg_count += listener->sent_messages.size();
+ }
+
+ // An 8k read of a 16k object in a 4+2 array normally needs 2 shard reads;
+ // with shard 1 missing, reconstruction requires 4 reads, so the post-failover
+ // read must send strictly more backend messages than the baseline.
+ EXPECT_GT(post_failover_msg_count, pre_failover_msg_count)
+ << "Post-failover read should require extra shard reads for EC reconstruction "
+ << "(pre: " << pre_failover_msg_count << ", post: " << post_failover_msg_count << ")";
+}
+
+// PrimaryFailoverWithPeering: fail the initial primary (OSD 0) and verify
+// that PeeringState elects a new primary, the PG returns to Active, and the
+// object written before the failover is still readable (EC reconstruction).
+// Also checks the identity of the new primary: a coding shard (>= k) for an
+// EC-optimized pool, shard 1 otherwise.
+TEST_F(TestECFailoverWithPeering, PrimaryFailoverWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_primary_failover";
+ const std::string test_data = "Data before primary failover";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ EXPECT_TRUE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Shard 0 should be primary initially";
+
+ // Mark OSD 0 (the initial primary) as down
+ // PeeringState will automatically determine the new primary
+ mark_osd_down(0);
+
+ // Determine the actual new primary from the OSDMap
+ int new_primary_shard = get_primary_shard_from_osdmap();
+ ASSERT_GE(new_primary_shard, 0) << "Should have a valid new primary after failover";
+
+ // For an optimized EC pool (k=4, m=2), the new primary should be a coding shard (>= k)
+ // For a non-optimized pool, it would be shard 1
+ const pg_pool_t& pool = get_pool();
+ if (pool.allows_ecoptimizations()) {
+ EXPECT_GE(new_primary_shard, k)
+ << "New primary should be a coding shard (>= k) for optimized pool";
+ } else {
+ EXPECT_EQ(new_primary_shard, 1)
+ << "New primary should be shard 1 for non-optimized pool";
+ }
+
+ // The new primary's backend listener must now report primary status, and
+ // the failed shard's must not.
+ EXPECT_TRUE(get_peering_listener(new_primary_shard)->backend_listener->pgb_is_primary())
+ << "Shard " << new_primary_shard << " should be new primary";
+
+ EXPECT_FALSE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Failed shard should not be primary";
+
+ std::string state = get_state_name(new_primary_shard)
+ EXPECT_TRUE(state.find("Active") != std::string::npos)
+ << "New primary should be Active after failover, got: " << state;
+
+ // Verify the PG reached Active state
+ EXPECT_TRUE(get_peering_state(new_primary_shard)->is_active())
+ << "New primary should be in Active state";
+
+ // Verify reads work after primary failover (with EC reconstruction)
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(),
+ read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully after primary failover";
+ ASSERT_EQ(read_data.length(), test_data.length())
+ << "Read length should match after primary failover";
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data)
+ << "Data should be correctly reconstructed via EC after primary failover";
+}
+
+// MultipleOSDFailuresWithPeering: fail exactly m OSDs at once (the pool's
+// full redundancy budget, leaving only k shards up) and verify the primary
+// drops both failed OSDs from its acting set while remaining operational
+// (Active, Peering, or Recovery). This test only inspects peering state;
+// it does not attempt reads with m shards down.
+TEST_F(TestECFailoverWithPeering, MultipleOSDFailuresWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_multiple_failures";
+ const std::string test_data = "Data before multiple failures";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ std::vector<int> failed_osds = {1, 2}; // Fail 2 data shards
+ ASSERT_EQ(failed_osds.size(), static_cast<size_t>(m))
+ << "Should fail exactly m OSDs";
+
+ // Use fixture helper to mark multiple OSDs as down
+ mark_osds_down(failed_osds);
+
+ // The fixture maps osd id N to shard N, hence shard_id_t(failed_osd).
+ auto* primary_ps = get_peering_state(0);
+ for (int failed_osd : failed_osds) {
+ EXPECT_TRUE(primary_ps->get_acting_recovery_backfill().count(
+ pg_shard_t(failed_osd, shard_id_t(failed_osd))) == 0)
+ << "Failed OSD " << failed_osd << " should not be in acting set";
+ }
+
+ // With no spare redundancy the PG may still be transitioning; accept any
+ // of the three operational states rather than requiring Active.
+ std::string primary_state = get_state_name(0);
+ EXPECT_TRUE(primary_state.find("Peering") != std::string::npos ||
+ primary_state.find("Active") != std::string::npos ||
+ primary_state.find("Recovery") != std::string::npos)
+ << "Primary should be operational, got: " << primary_state;
+}
+
+// PeeringWithLogDivergence: write two objects at explicit versions (1,1) and
+// (1,2) to create distinct PG log entries, advance the OSDMap epoch to force
+// a fresh peering cycle, then verify:
+//   * both objects remain readable and byte-correct,
+//   * the primary's PG log head still reflects the latest write, and
+//   * every acting shard's info.last_update is consistent with (<=) the
+//     primary's log head.
+// NOTE(review): no shard actually diverges here — the "divergence" is
+// simulated purely by the explicit eversion numbering plus a re-peer via
+// advance_epoch(). Confirm against the fixture whether this exercises real
+// log-reconciliation paths.
+TEST_F(TestECFailoverWithPeering, PeeringWithLogDivergence) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string pre_div_obj = "test_pre_divergence";
+ const std::string pre_div_data = "Data written before divergence";
+
+ // Explicit eversion (epoch 1, version 1) marks the pre-divergence point.
+ int result = create_and_write(pre_div_obj, pre_div_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << "Pre-divergence write should complete";
+
+ auto* primary_ps = get_peering_state(0);
+ size_t initial_log_size = primary_ps->get_pg_log().get_log().log.size();
+ EXPECT_GT(initial_log_size, 0) << "Primary should have log entries after pre-divergence write";
+
+ // Note: get_pg_log().get_log().head reflects the log entries added via append_log
+ eversion_t pre_div_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(pre_div_log_head.version, 0u) << "PG log head should be non-zero after write";
+
+ const std::string post_div_obj = "test_post_divergence";
+ const std::string post_div_data = "Data written after divergence point";
+
+ // Second write at (1, 2) — one version past the divergence point.
+ result = create_and_write(post_div_obj, post_div_data, eversion_t(1, 2));
+ EXPECT_EQ(result, 0) << "Post-divergence write should complete";
+
+ eversion_t post_div_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(post_div_log_head.version, pre_div_log_head.version)
+ << "PG log head should advance after post-divergence write";
+
+ size_t log_size_after_writes = primary_ps->get_pg_log().get_log().log.size();
+ EXPECT_GE(log_size_after_writes, initial_log_size)
+ << "Primary log should have at least as many entries after second write";
+
+ // Trigger a new peering cycle by advancing the map to simulate re-peering
+ // after a shard had a divergent log.
+ advance_epoch();
+
+ std::string primary_state = get_state_name(0);
+ ASSERT_TRUE(all_shards_active() ||
+ primary_state.find("Recovery") != std::string::npos ||
+ primary_state.find("Peering") != std::string::npos)
+ << "Shards should be active, recovering, or peering after map advance, got: "
+ << primary_state;
+
+ // --- Verify pre-divergence data is readable and correct ---
+ bufferlist pre_div_read;
+ int read_result = read_object(pre_div_obj, 0, pre_div_data.length(),
+ pre_div_read, pre_div_data.length());
+ EXPECT_GE(read_result, 0) << "Pre-divergence object should be readable after reconciliation";
+ ASSERT_EQ(pre_div_read.length(), pre_div_data.length())
+ << "Pre-divergence read length should match";
+ {
+ std::string read_str(pre_div_read.c_str(), pre_div_read.length());
+ EXPECT_EQ(read_str, pre_div_data)
+ << "Pre-divergence data should match after log reconciliation";
+ }
+
+ // --- Verify post-divergence data is readable and correct ---
+ bufferlist post_div_read;
+ read_result = read_object(post_div_obj, 0, post_div_data.length(),
+ post_div_read, post_div_data.length());
+ EXPECT_GE(read_result, 0) << "Post-divergence object should be readable after reconciliation";
+ ASSERT_EQ(post_div_read.length(), post_div_data.length())
+ << "Post-divergence read length should match";
+ {
+ std::string read_str(post_div_read.c_str(), post_div_read.length());
+ EXPECT_EQ(read_str, post_div_data)
+ << "Post-divergence data should match after log reconciliation";
+ }
+
+ // After peering, the primary's PG log head should reflect all writes.
+ eversion_t primary_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_EQ(primary_log_head, post_div_log_head)
+ << "Primary PG log head should reflect all writes after reconciliation";
+
+ // Cross-check every acting shard's last_update against the primary's head.
+ pg_t pgid = get_peering_state(0)->get_info().pgid.pgid;
+ std::vector<int> acting_osds;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting_osds, &acting_primary);
+
+ for (int shard : acting_osds) {
+ if (shard == CRUSH_ITEM_NONE) {
+ continue;
+ }
+ auto* shard_ps = get_peering_state(shard);
+ if (shard_ps->is_active()) {
+ eversion_t shard_info_last_update = shard_ps->get_info().last_update;
+ if (shard == acting_primary) {
+ EXPECT_EQ(shard_info_last_update, post_div_log_head)
+ << "Primary shard info.last_update should match post-divergence log head";
+ } else {
+ // Replicas may lag but must never claim updates the primary lacks.
+ EXPECT_LE(shard_info_last_update, post_div_log_head)
+ << "Shard " << shard << " info.last_update should not exceed primary's log head";
+ }
+ }
+ }
+
+ // Verify the formerly-failed shard's PG log is accessible and consistent.
+ // We use the last data shard (k-1) as the "formerly-failed" shard to check.
+ int reconciled_shard = k - 1;
+ if (reconciled_shard >= 0 && reconciled_shard < k + m) {
+ auto* reconciled_ps = get_peering_state(reconciled_shard);
+ size_t reconciled_log_size = reconciled_ps->get_pg_log().get_log().log.size();
+ auto* primary_ps_check = get_peering_state(acting_primary);
+ size_t primary_log_size = primary_ps_check->get_pg_log().get_log().log.size();
+ EXPECT_LE(reconciled_log_size, primary_log_size)
+ << "Reconciled shard " << reconciled_shard
+ << " log size should not exceed primary's log size";
+
+ if (reconciled_ps->is_active()) {
+ eversion_t reconciled_info_lu = reconciled_ps->get_info().last_update;
+ EXPECT_LE(reconciled_info_lu, post_div_log_head)
+ << "Reconciled shard " << reconciled_shard
+ << " info.last_update should not exceed primary's log head after log reconciliation";
+ }
+ }
+}
+
+// RecoveryWithPeering: write two objects, fail the last data shard (k-1),
+// then verify:
+//   * both pre-failure objects remain readable and byte-correct via EC
+//     reconstruction from the surviving k shards,
+//   * a new write still succeeds with the shard down and advances the
+//     primary's PG log head,
+//   * the failed shard's PeeringState (and its pre-failure log) is still
+//     accessible and no larger than the primary's, and
+//   * on_activate_complete was invoked on the primary's listener during
+//     peering.
+TEST_F(TestECFailoverWithPeering, RecoveryWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj1_name = "test_recovery_obj1";
+ const std::string obj1_data = "First object data for recovery test";
+
+ const std::string obj2_name = "test_recovery_obj2";
+ const std::string obj2_data = "Second object data for recovery test";
+
+ // Explicit eversions (1,1) and (1,2) give the two writes distinct log entries.
+ int result = create_and_write(obj1_name, obj1_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << "First pre-failure write should complete";
+
+ result = create_and_write(obj2_name, obj2_data, eversion_t(1, 2));
+ EXPECT_EQ(result, 0) << "Second pre-failure write should complete";
+
+ EXPECT_TRUE(all_shards_clean()) << "All shards should be clean before recovery test";
+
+ auto* primary_ps = get_peering_state(0);
+ eversion_t pre_failure_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(pre_failure_log_head.version, 0u)
+ << "Primary should have log entries before failure";
+
+ int failed_osd = k - 1; // Last data shard
+
+ // Use fixture helper to mark OSD as down
+ mark_osd_down(failed_osd);
+
+ std::string state_after_failure = get_state_name(0);
+ ASSERT_TRUE(all_shards_active() ||
+ state_after_failure.find("Recovery") != std::string::npos ||
+ state_after_failure.find("Peering") != std::string::npos)
+ << "PG should be active, recovering, or peering after OSD failure, got: "
+ << state_after_failure;
+
+ // EC can reconstruct data from remaining k shards even with one shard missing
+ bufferlist obj1_read;
+ int read_result = read_object(obj1_name, 0, obj1_data.length(),
+ obj1_read, obj1_data.length());
+ EXPECT_GE(read_result, 0) << "First object should be readable after OSD failure";
+ ASSERT_EQ(obj1_read.length(), obj1_data.length())
+ << "First object read length should match after failure";
+ {
+ std::string read_str(obj1_read.c_str(), obj1_read.length());
+ EXPECT_EQ(read_str, obj1_data)
+ << "First object data should be correct after OSD failure (EC reconstruction)";
+ }
+
+ bufferlist obj2_read;
+ read_result = read_object(obj2_name, 0, obj2_data.length(),
+ obj2_read, obj2_data.length());
+ EXPECT_GE(read_result, 0) << "Second object should be readable after OSD failure";
+ ASSERT_EQ(obj2_read.length(), obj2_data.length())
+ << "Second object read length should match after failure";
+ {
+ std::string read_str(obj2_read.c_str(), obj2_read.length());
+ EXPECT_EQ(read_str, obj2_data)
+ << "Second object data should be correct after OSD failure (EC reconstruction)";
+ }
+
+ // A third write, issued while the shard is down, must still succeed.
+ const std::string post_recovery_obj = "test_post_recovery";
+ const std::string post_recovery_data = "Data written after OSD failure and recovery";
+
+ result = create_and_write(post_recovery_obj, post_recovery_data, eversion_t(1, 3));
+ EXPECT_EQ(result, 0) << "Write after OSD failure should complete successfully";
+
+ bufferlist post_recovery_read;
+ read_result = read_object(post_recovery_obj, 0, post_recovery_data.length(),
+ post_recovery_read, post_recovery_data.length());
+ EXPECT_GE(read_result, 0) << "Post-recovery object should be readable";
+ ASSERT_EQ(post_recovery_read.length(), post_recovery_data.length())
+ << "Post-recovery read length should match";
+ {
+ std::string read_str(post_recovery_read.c_str(), post_recovery_read.length());
+ EXPECT_EQ(read_str, post_recovery_data)
+ << "Post-recovery data should match what was written";
+ }
+
+ eversion_t post_recovery_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(post_recovery_log_head.version, pre_failure_log_head.version)
+ << "Primary PG log head should advance after post-recovery write";
+
+ // Even though the OSD is "down", its PeeringState still holds the log
+ // from before it went down.
+ auto* failed_ps = get_peering_state(failed_osd);
+ EXPECT_TRUE(failed_ps != nullptr) << "Failed OSD's PeeringState should still exist";
+
+ size_t primary_log_size = primary_ps->get_pg_log().get_log().log.size();
+ size_t failed_log_size = failed_ps->get_pg_log().get_log().log.size();
+ EXPECT_LE(failed_log_size, primary_log_size)
+ << "Failed OSD's PG log size should not exceed primary's log size";
+ // The primary wrote 3 objects (obj1, obj2, post_recovery_obj), so its log must be non-empty.
+ EXPECT_GT(primary_log_size, 0u)
+ << "Primary PG log should have entries after 3 writes";
+
+ auto* listener_ptr = get_peering_listener(0);
+ EXPECT_TRUE(listener_ptr != nullptr) << "Peering listener should exist";
+ EXPECT_TRUE(listener_ptr->activate_complete_called)
+ << "on_activate_complete should have been called during peering";
+}
+
using namespace std;
-
IsPGRecoverablePredicate *get_is_recoverable_predicate() {
return new MockECRecPred();
}
return new MockECReadPred();
}
-
// Test fixture for PeeringState tests
class PeeringStateTest : public ::testing::Test {
protected:
for (auto it = ls.begin(); it != ls.end();) {
MessageRef m = *it;
it = ls.erase(it);
- // TODO : Should handle messages other than MOSDPeeringOp events, however
- // for now this seems to be sufficient
+ // NOTE: This dispatcher only handles MOSDPeeringOp-derived messages (MOSDPGLog,
+ // MOSDPGNotify2, MOSDPGInfo2, MOSDPGLease, MOSDPGLeaseAck, MOSDPGQuery2, MOSDPGTrim).
+ // Non-peering messages like MOSDPGRemove and MRecoveryReserve are sent via
+ // send_cluster_message() but are not dispatched through this function - they are
+ // handled by other test mechanisms or are not relevant to peering state transitions.
+ // This is sufficient for testing PeeringState behavior as all peering-related
+ // messages derive from MOSDPeeringOp and provide get_event() for state machine events.
+ // Future enhancement: If testing non-peering cluster messages becomes necessary,
+ // add type checking and appropriate handling for Message-derived (non-MOSDPeeringOp) types.
dout(0) << __func__ << " message type = " << m->get_type() << dendl;
MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m.get());
dout(0) << __func__ << " sending from osd." << fromosd << " to osd." << osd << " " << *pm << dendl;
PGPool pool(osdmap, pool_id, pi, osdmap->get_pool_name(pool_id));
dpp[osd] = make_unique<DppHelper>(g_ceph_context, dout_subsys, this, osd, shard);
spg_t spgid = spg_t(pg_t(0, pool_id), pg_whoami.shard);
- listeners[osd] = make_unique<MockPeeringListener>(osdmap, pi, get_dpp(osd), pg_whoami);
+ listeners[osd] = make_unique<MockPeeringListener>(osdmap, pool_id, get_dpp(osd), pg_whoami);
get_listener(osd)->current_epoch = osdmap->get_epoch();
unique_ptr<PeeringState> ps = make_unique<PeeringState>(
g_ceph_context,