# unittest_ecutil
add_executable(unittest_ecutil
  TestECUtil.cc
  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:erasure_code_objs>
  )
add_ceph_unittest(unittest_ecutil)
target_link_libraries(unittest_ecutil osd global)
)
add_ceph_unittest(unittest_peeringstate)
target_link_libraries(unittest_peeringstate osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# pg_backend_test_fixture: object library for PGBackendTestFixture implementation.
+# An OBJECT library compiles the sources once; test executables pull the
+# resulting objects in via $<TARGET_OBJECTS:pg_backend_test_fixture>.
+add_library(pg_backend_test_fixture OBJECT
+  PGBackendTestFixture.cc
+)
+# target_link_libraries on an OBJECT library does not link anything; it only
+# propagates usage requirements (include dirs, definitions) from osd/os/global
+# so the fixture sources compile (requires CMake >= 3.12 semantics).
+target_link_libraries(pg_backend_test_fixture osd os global)
+
+# ec_peering_test_fixture: object library for ECPeeringTestFixture implementation
+add_library(ec_peering_test_fixture OBJECT
+  ECPeeringTestFixture.cc
+)
+target_link_libraries(ec_peering_test_fixture osd os global)
+
+# unittest_backend_basics (replaces unittest_ecbasics + unittest_replicatedbasics)
+add_executable(unittest_backend_basics
+  TestBackendBasics.cc
+  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:pg_backend_test_fixture>
+  )
+add_ceph_unittest(unittest_backend_basics)
+target_link_libraries(unittest_backend_basics osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# add_dependencies (not target_link_libraries) is intentional here: it only
+# orders the build so the erasure-code plugins exist before the test runs.
+# NOTE(review): presumably the test loads ec_isa/ec_jerasure at runtime via
+# the plugin loader rather than linking them - confirm.
+add_dependencies(unittest_backend_basics ec_isa ec_jerasure)
+# unittest_ecfailover_with_peering
+add_executable(unittest_ecfailover_with_peering
+  TestECFailoverWithPeering.cc
+  $<TARGET_OBJECTS:unit-main>
+  $<TARGET_OBJECTS:store_test_fixture>
+  $<TARGET_OBJECTS:pg_backend_test_fixture>
+  $<TARGET_OBJECTS:ec_peering_test_fixture>
+  )
+add_ceph_unittest(unittest_ecfailover_with_peering)
+target_link_libraries(unittest_ecfailover_with_peering osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+# Build-order dependency only: the ec_isa plugin is loaded at runtime.
+# NOTE(review): unittest_backend_basics also depends on ec_jerasure but this
+# test only needs ec_isa - confirm the asymmetry is intentional.
+add_dependencies(unittest_ecfailover_with_peering ec_isa)
# unittest_hitset
add_executable(unittest_hitset
hitset.cc
target_link_libraries(unittest_mclock_scheduler
global osd dmclock os
)
+
+# osd_unittests: custom target that builds all OSD unit tests and runs them
+# via ctest. This is for development convenience only; it is not used as
+# part of "make check".
+# unittest_osdmap is deliberately excluded because it is relatively slow;
+# it is tested elsewhere.
+set(OSD_UNITTESTS
+  unittest_backend_basics
+  unittest_ec_transaction
+  unittest_ec_transaction_l
+  unittest_ecbackend
+  unittest_ecbackend_l
+  unittest_ecfailover_with_peering
+  unittest_ecutil
+  unittest_extent_cache
+  unittest_extent_cache_l
+  unittest_hitset
+  unittest_mclock_scheduler
+  unittest_osd_osdcap
+  unittest_osd_types
+  unittest_osdscrub
+  unittest_peeringstate
+  unittest_pg_transaction
+  unittest_pglog
+  unittest_scrubber_be
+)
+
+# Using ctest (instead of invoking the binaries directly) ensures:
+# - All selected tests run even if one fails (ctest's default behaviour;
+#   it only stops early when --stop-on-failure is passed, which we don't)
+# - Output is shown for failing tests (--output-on-failure)
+# - Adding a new test only requires adding it to OSD_UNITTESTS above
+string(JOIN "|" OSD_UNITTEST_REGEX ${OSD_UNITTESTS})
+add_custom_target(osd_unittests
+  DEPENDS ${OSD_UNITTESTS}
+  COMMAND ${CMAKE_CTEST_COMMAND}
+    --test-dir ${CMAKE_BINARY_DIR}
+    -R "^(${OSD_UNITTEST_REGEX})$"
+    --output-on-failure
+  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+  COMMENT "Building and running all OSD unit tests via ctest"
+  VERBATIM
+)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "test/osd/ECPeeringTestFixture.h"
+
+// Build the full peering stack for one shard: a dout prefix provider, a
+// MockPeeringListener (which adopts the backend listener created by the base
+// fixture), a MockPGBackend wired to that listener, and finally the
+// PeeringState itself plus a fresh PeeringCtx.
+// Returns a borrowed pointer; ownership stays in shard_peering_states[shard].
+PeeringState* ECPeeringTestFixture::create_peering_state(int shard)
+{
+  const pg_pool_t& pi = get_pool();
+  // In these tests the OSD id and the EC shard id are the same value.
+  pg_shard_t pg_whoami(shard, shard_id_t(shard));
+  PGPool pool(osdmap, pool_id, pi, "test_pool");
+
+  shard_dpps[shard] = std::make_unique<ShardDpp>(g_ceph_context, this, shard);
+
+  shard_peering_listeners[shard] = std::make_unique<MockPeeringListener>(
+    osdmap, pool_id, shard_dpps[shard].get(), pg_whoami);
+  shard_peering_listeners[shard]->current_epoch = osdmap->get_epoch();
+
+  // Route transactions issued by the peering listener through this fixture's
+  // per-shard store helper.
+  shard_peering_listeners[shard]->queue_transaction_callback =
+    [this, shard](ObjectStore::Transaction&& t) -> int {
+      return queue_transaction_helper(shard, std::move(t));
+    };
+
+  // Transfer ownership of the backend listener from the base class listeners[]
+  // map into the peering listener. The factory (set in our constructor) already
+  // recorded a raw pointer in backend_listeners[] so we know which entry to move.
+  // After the move, listeners[shard] holds a null unique_ptr; TearDown() already
+  // guards against that with "if (list)".
+  shard_peering_listeners[shard]->backend_listener = std::move(listeners[shard]);
+  shard_peering_listeners[shard]->coll = colls[shard];
+  shard_peering_listeners[shard]->ch = chs[shard];
+
+  // Recreate backend with the correct backend_listener pointer.
+  // The MockPeeringListener constructor created backend with the temporary
+  // backend_listener it allocated internally, but we just replaced backend_listener
+  // with the one from the base class listeners[] map. We must recreate backend
+  // so its parent pointer points to the new backend_listener, not the destroyed one.
+  shard_peering_listeners[shard]->backend = std::make_unique<MockPGBackend>(
+    g_ceph_context,
+    shard_peering_listeners[shard]->backend_listener.get(),
+    nullptr,
+    colls[shard],
+    chs[shard]);
+
+  spg_t spgid(pgid, shard_id_t(shard));
+  auto ps = std::make_unique<PeeringState>(
+    g_ceph_context,
+    pg_whoami,
+    spgid,
+    pool,
+    osdmap,
+    PG_FEATURE_CLASSIC_ALL,
+    shard_dpps[shard].get(),
+    shard_peering_listeners[shard].get());
+
+  shard_peering_listeners[shard]->ps = ps.get();
+
+  // NOTE(review): the predicates are heap-allocated by the helpers below;
+  // presumably PeeringState takes ownership - confirm.
+  ps->set_backend_predicates(
+    get_is_readable_predicate(),
+    get_is_recoverable_predicate());
+
+  shard_peering_states[shard] = std::move(ps);
+  shard_peering_listeners[shard]->backend_listener->set_peering_state(shard_peering_states[shard].get());
+  shard_peering_ctxs[shard] = std::make_unique<PeeringCtx>();
+
+  return shard_peering_states[shard].get();
+}
+
+// Seed the PG history and call PeeringState::init() on every acting shard,
+// committing the resulting transaction to each shard's store.
+// When dne is true, history.epoch_created is left at its default so the PG
+// appears not to exist yet (the "does-not-exist" creation path).
+void ECPeeringTestFixture::init_peering(bool dne)
+{
+  pg_history_t history;
+  history.same_interval_since = osdmap->get_epoch();
+  history.epoch_pool_created = osdmap->get_epoch();
+  history.last_epoch_clean = osdmap->get_epoch();
+  if (!dne) {
+    history.epoch_created = osdmap->get_epoch();
+  }
+  PastIntervals past_intervals;
+
+  // Get primary from OSDMap using base class pgid member
+  std::vector<int> up_osds, acting_osds;
+  int up_primary = -1, acting_primary = -1;
+  osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+  for (int shard : acting_osds) {
+    ObjectStore::Transaction t;
+    // NOTE(review): role is 0 for the primary and 1 for every other shard;
+    // confirm non-primary EC shards don't need their actual acting-set
+    // position here.
+    get_peering_state(shard)->init(
+      (shard == acting_primary) ? 0 : 1, // role
+      up_osds,
+      up_primary,
+      acting_osds,
+      acting_primary,
+      history,
+      past_intervals,
+      t);
+
+    queue_transaction_helper(shard, std::move(t));
+  }
+}
+
+// Install a new OSDMap and push the change through every shard's
+// PeeringState (advance_map then activate_map), dispatch all resulting
+// messages/events, and finally publish up to three extra epochs so that
+// up_thru / pg_temp requests raised during peering get satisfied.
+void ECPeeringTestFixture::update_osdmap_with_peering(
+  std::shared_ptr<OSDMap> new_osdmap,
+  std::optional<pg_shard_t> new_primary)
+{
+  OSDMapRef old_osdmap = osdmap;
+
+  update_osdmap(new_osdmap, new_primary);
+
+  // Update peering listeners for ALL shards (even failed ones need epoch updates)
+  for (auto& [shard, listener] : shard_peering_listeners) {
+    listener->current_epoch = new_osdmap->get_epoch();
+  }
+
+  // Get primary from OSDMap for advance_map calls using base class pgid member
+  std::vector<int> up_osds, acting_osds;
+  int up_primary = -1, acting_primary = -1;
+  osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+  // Call advance_map on ALL shards that have peering states, including failed ones
+  // This ensures that failed OSDs are notified of map changes (e.g., primary failover)
+  // Use the newly computed up_osds and acting_osds from the new OSDMap
+  for (auto& [shard, ps] : shard_peering_states) {
+    ps->advance_map(
+      osdmap, old_osdmap, up_osds, up_primary, acting_osds, acting_primary,
+      *get_peering_ctx(shard));
+  }
+
+  // Call activate_map on ALL shards that have peering states
+  // This ensures failed OSDs properly transition state and notify their backends
+  for (auto& [shard, ps] : shard_peering_states) {
+    ps->activate_map(*get_peering_ctx(shard));
+  }
+
+  dispatch_all();
+
+  // Handle up_thru requirements - keep creating new epochs until peering completes.
+  // Note: For primary failover scenarios, full peering may not complete immediately.
+  // The loop is bounded: new_epoch(true) returns false once no work is pending.
+  int max_iterations = 3;
+  do {
+    event_advance_map();
+    event_activate_map();
+  } while (new_epoch(true) && --max_iterations);
+}
+
+// Emulate the monitor publishing a new OSDMap epoch. Grants any pending
+// up_thru requests from acting shards and applies any pg_temp the current
+// primary's listener requested (with the primaryfirst transformation for
+// optimized EC pools). When if_required is true, no epoch is published
+// unless at least one such change is pending.
+// Returns true iff a new epoch was applied.
+bool ECPeeringTestFixture::new_epoch(bool if_required)
+{
+  bool did_work = false;
+  epoch_t e = osdmap->get_epoch();
+  OSDMap::Incremental pending_inc(e + 1);
+  pending_inc.fsid = osdmap->get_fsid();
+
+  // Get acting set from OSDMap
+  std::vector<int> acting_osds;
+  int acting_primary = -1;
+  osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+  for (int shard : acting_osds) {
+    // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+    if (shard == CRUSH_ITEM_NONE) {
+      continue;
+    }
+    // Grant up_thru for any shard whose peering state asked for it.
+    if (get_peering_state(shard)->get_need_up_thru()) {
+      pending_inc.new_up_thru[shard] = e;
+      did_work = true;
+    }
+  }
+
+  if (acting_primary >= 0) {
+    auto& listener = shard_peering_listeners[acting_primary];
+    if (listener->pg_temp_wanted) {
+      // Get up set from OSDMap
+      std::vector<int> up_osds;
+      int up_primary = -1;
+      osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, nullptr, nullptr);
+
+      // Fall back to the up set when the listener didn't record an
+      // explicit next_acting.
+      std::vector<int> acting_temp = listener->next_acting;
+      if (acting_temp.empty()) {
+        acting_temp = up_osds;
+      }
+
+      // Apply the pg_temp change that peering requested.
+      // For EC pools with optimizations, transform to primaryfirst order
+      // (this simulates what the monitor does in production).
+      const pg_pool_t* pool = osdmap->get_pg_pool(this->pgid.pool());
+      std::vector<int> pg_temp_acting = acting_temp;
+      if (pool && pool->allows_ecoptimizations()) {
+        pg_temp_acting = osdmap->pgtemp_primaryfirst(*pool, acting_temp);
+      }
+
+      pending_inc.new_pg_temp[this->pgid] =
+        mempool::osdmap::vector<int>(pg_temp_acting.begin(), pg_temp_acting.end());
+
+      listener->pg_temp_wanted = false;
+      did_work = true;
+    }
+  }
+
+  if (!did_work && if_required) {
+    return false;
+  }
+
+  osdmap->apply_incremental(pending_inc);
+
+  // Tell every shard's listener about the new epoch.
+  for (auto& [shard, listener] : shard_peering_listeners) {
+    listener->current_epoch = osdmap->get_epoch();
+  }
+
+  return true;
+}
+
+// Drive a full peering cycle from scratch: initialise peering on every
+// acting shard, pump the Initialize/AdvanceMap/ActivateMap events, then
+// keep publishing new epochs (to satisfy up_thru / pg_temp requests) until
+// every acting shard reports active, bounded by a fixed iteration budget.
+void ECPeeringTestFixture::run_peering_cycle()
+{
+  init_peering();
+  event_initialize();
+  dispatch_all();
+  event_advance_map();
+  dispatch_all();
+  event_activate_map();
+  dispatch_all();
+
+  // Handle up_thru requirements - keep creating new epochs until peering
+  // completes or the iteration budget runs out.
+  int remaining = 10;
+  while (remaining-- > 0 && !all_shards_active()) {
+    if (!new_epoch(true)) {
+      continue;
+    }
+    event_advance_map();
+    dispatch_all();
+    event_activate_map();
+    dispatch_all();
+  }
+}
+
+// Submit a transaction to one shard's object store.
+// Empty transactions are a no-op and return 0; otherwise returns the
+// ObjectStore::queue_transaction() result.
+int ECPeeringTestFixture::queue_transaction_helper(int shard, ObjectStore::Transaction&& t)
+{
+  // Nothing to commit for an empty transaction.
+  if (t.empty()) {
+    return 0;
+  }
+
+  // Contexts are stolen by MockPGBackendListener::queue_transaction, so no
+  // explicit finisher execution is needed here.
+  return store->queue_transaction(chs[shard], std::move(t));
+}
+
+// Mark one OSD down in a brand-new OSDMap epoch and drive peering through
+// the change. This mirrors the real monitor: only the down flag is set
+// here - pg_temp is NOT set. Peering is expected to detect the change and
+// request pg_temp itself, which we then grant below.
+void ECPeeringTestFixture::mark_osd_down(int osd_id)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osd_down(next_map, osd_id);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+
+  // Grant any pg_temp request raised by peering (emulates the monitor
+  // processing MOSDPGTemp, applying the primaryfirst transform if needed).
+  if (new_epoch(false)) {
+    event_advance_map();
+    dispatch_all();
+  }
+}
+
+// Mark one OSD up in a brand-new OSDMap epoch and drive peering through
+// the change (via OSDMapTestHelpers).
+void ECPeeringTestFixture::mark_osd_up(int osd_id)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osd_up(next_map, osd_id);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
+// Mark several OSDs down in a single new OSDMap epoch and drive peering
+// through the change (via OSDMapTestHelpers).
+void ECPeeringTestFixture::mark_osds_down(const std::vector<int>& osd_ids)
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::mark_osds_down(next_map, osd_ids);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
+// Publish a new OSDMap epoch without changing any OSD state and drive
+// peering through it. Useful for re-peering scenarios.
+void ECPeeringTestFixture::advance_epoch()
+{
+  auto next_map = std::make_shared<OSDMap>();
+  next_map->deepish_copy_from(*osdmap);
+  OSDMapTestHelpers::advance_epoch(next_map);
+
+  update_osdmap_with_peering(next_map);
+  dispatch_all();
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <map>
+#include <vector>
+#include "test/osd/PGBackendTestFixture.h"
+#include "test/osd/MockPeeringListener.h"
+#include "test/osd/MockConnection.h"
+#include "test/osd/MockECRecPred.h"
+#include "test/osd/MockECReadPred.h"
+#include "test/osd/OSDMapTestHelpers.h"
+#include "osd/PeeringState.h"
+#include "messages/MOSDPeeringOp.h"
+
+/**
+ * ECPeeringTestFixture - EC test fixture with full peering infrastructure
+ *
+ * This fixture extends PGBackendTestFixture to add full PeeringState support
+ * for each shard, enabling comprehensive testing of EC peering, recovery,
+ * and failover scenarios. It combines the principles from TestPeeringState
+ * with the EC backend infrastructure from PGBackendTestFixture.
+ */
+class ECPeeringTestFixture : public PGBackendTestFixture {
+protected:
+  // Per-shard peering machinery, keyed by shard/OSD id (the two are the
+  // same value in these tests).
+  std::map<int, std::unique_ptr<PeeringState>> shard_peering_states;
+  std::map<int, std::unique_ptr<PeeringCtx>> shard_peering_ctxs;
+  std::map<int, std::unique_ptr<MockPeeringListener>> shard_peering_listeners;
+
+  // Per-shard queues; NOTE(review): neither appears to be populated within
+  // this class - presumably filled by tests or the base fixture; confirm.
+  std::map<int, std::list<MessageRef>> shard_messages;
+  std::map<int, std::list<PGPeeringEventRef>> shard_events;
+
+  // Raw-pointer map giving this fixture direct access to the backend listeners
+  // created by the listener_factory. The pointers are valid for the lifetime
+  // of the test because ownership is transferred to
+  // shard_peering_listeners[i]->backend_listener in create_peering_state().
+  std::map<int, MockPGBackendListener*> backend_listeners;
+
+  // Per-shard DoutPrefixProvider: prefixes log lines with the shard id and
+  // (when available) the current PeeringState.
+  class ShardDpp : public NoDoutPrefix {
+  public:
+    ECPeeringTestFixture *fixture;
+    int shard;
+
+    ShardDpp(CephContext *cct, ECPeeringTestFixture *f, int s)
+      : NoDoutPrefix(cct, ceph_subsys_osd), fixture(f), shard(s) {}
+
+    std::ostream& gen_prefix(std::ostream& out) const override {
+      out << "shard " << shard << ": ";
+      if (fixture->shard_peering_states.contains(shard)) {
+        PeeringState *ps = fixture->shard_peering_states[shard].get();
+        out << *ps << " ";
+      }
+      return out;
+    }
+  };
+  std::map<int, std::unique_ptr<ShardDpp>> shard_dpps;
+
+  // Heap-allocates a recoverability predicate for set_backend_predicates().
+  // NOTE(review): presumably the callee takes ownership - confirm.
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+    return new MockECRecPred(k, m);
+  }
+
+  // Heap-allocates a readability predicate; same ownership note as above.
+  IsPGReadablePredicate *get_is_readable_predicate() {
+    return new MockECReadPred(k, m);
+  }
+
+public:
+  ECPeeringTestFixture() : PGBackendTestFixture(PGBackendTestFixture::EC) {
+    // Install a listener_factory so that setup_ec_pool() creates listeners
+    // that we can access directly (via backend_listeners[]) without needing
+    // to steal ownership via release_listener().
+    //
+    // The factory records a raw pointer in backend_listeners[instance] and
+    // returns the unique_ptr to the base class, which stores it in listeners[].
+    // In create_peering_state() we then move that unique_ptr from listeners[]
+    // into shard_peering_listeners[]->backend_listener, at which point the
+    // raw pointer in backend_listeners[] remains valid (owned by the peering
+    // listener for the rest of the test).
+    listener_factory = [this](
+      int instance,
+      std::shared_ptr<OSDMap> om,
+      int64_t pool_id,
+      DoutPrefixProvider* dpp_arg,
+      pg_shard_t whoami) -> std::unique_ptr<MockPGBackendListener>
+    {
+      auto bl = std::make_unique<MockPGBackendListener>(
+        om, pool_id, dpp_arg, whoami);
+      // Record raw pointer so tests can access the listener directly
+      backend_listeners[instance] = bl.get();
+      return bl;
+    };
+  }
+
+  // Runs the base fixture setup, then builds a peering state for each of
+  // the k+m shards.
+  void SetUp() override {
+    PGBackendTestFixture::SetUp();
+    for (int i = 0; i < k + m; i++) {
+      create_peering_state(i);
+    }
+  }
+
+  // Tear down peering state before the base fixture so nothing references
+  // base-class resources during destruction.
+  void TearDown() override {
+    shard_peering_states.clear();
+    shard_peering_ctxs.clear();
+    shard_peering_listeners.clear();
+    shard_dpps.clear();
+    shard_messages.clear();
+    shard_events.clear();
+    PGBackendTestFixture::TearDown();
+  }
+
+  PeeringState* create_peering_state(int shard);
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  PeeringState* get_peering_state(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_states.find(shard);
+    ceph_assert(it != shard_peering_states.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  PeeringCtx* get_peering_ctx(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_ctxs.find(shard);
+    ceph_assert(it != shard_peering_ctxs.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  // Borrowed pointer; asserts the shard exists and was initialised.
+  MockPeeringListener* get_peering_listener(int shard) {
+    ceph_assert(shard >= 0 && shard < k + m);
+    auto it = shard_peering_listeners.find(shard);
+    ceph_assert(it != shard_peering_listeners.end());
+    ceph_assert(it->second != nullptr);
+    return it->second.get();
+  }
+
+  /**
+   * Query the OSDMap to determine which shard is the primary.
+   * This is the authoritative source of truth for primary determination.
+   *
+   * @return The shard ID of the primary, or -1 if no primary exists
+   */
+  int get_primary_shard_from_osdmap() const {
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+    return acting_primary;
+  }
+
+  // Override base class methods to work with peering fixture's structure
+  MockPGBackendListener* get_primary_listener() override {
+    int primary_shard = get_primary_shard_from_osdmap();
+    if (primary_shard < 0) {
+      return nullptr;
+    }
+
+    auto it = shard_peering_listeners.find(primary_shard);
+    if (it != shard_peering_listeners.end() && it->second &&
+        it->second->backend_listener) {
+      // Assert that the backend listener agrees it's primary
+      ceph_assert(it->second->backend_listener->pgb_is_primary());
+      return it->second->backend_listener.get();
+    }
+    return nullptr;
+  }
+
+  PGBackend* get_primary_backend() override {
+    int primary_shard = get_primary_shard_from_osdmap();
+    if (primary_shard < 0) {
+      return nullptr;
+    }
+
+    auto listener_it = shard_peering_listeners.find(primary_shard);
+    if (listener_it != shard_peering_listeners.end() && listener_it->second &&
+        listener_it->second->backend_listener) {
+      // Assert that the backend listener agrees it's primary
+      ceph_assert(listener_it->second->backend_listener->pgb_is_primary());
+
+      // Return the backend from the base class's backends map, not from
+      // the peering listener, because the base class backend is connected
+      // to the event loop and message routers
+      auto backend_it = backends.find(primary_shard);
+      return (backend_it != backends.end()) ? backend_it->second.get() : nullptr;
+    }
+    return nullptr;
+  }
+
+  void init_peering(bool dne = false);
+
+  // Deliver a PeeringState::Initialize event to every acting shard.
+  void event_initialize() {
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      auto evt = std::make_shared<PGPeeringEvent>(
+        osdmap->get_epoch(),
+        osdmap->get_epoch(),
+        PeeringState::Initialize());
+
+      get_peering_state(shard)->handle_event(evt, get_peering_ctx(shard));
+    }
+  }
+
+  // Run PeeringState::advance_map() on every acting shard with the current
+  // map passed as both old and new map (no epoch change).
+  void event_advance_map() {
+    // Get primary from OSDMap - query once before the loop
+    std::vector<int> up_osds, acting_osds;
+    int up_primary = -1, acting_primary = -1;
+    osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      get_peering_state(shard)->advance_map(
+        osdmap, osdmap, up_osds, up_primary, acting_osds, acting_primary,
+        *get_peering_ctx(shard));
+    }
+  }
+
+  // Run PeeringState::activate_map() on every acting shard.
+  void event_activate_map() {
+    // Get acting set from OSDMap - must use same set as advance_map
+    std::vector<int> up_osds, acting_osds;
+    int up_primary = -1, acting_primary = -1;
+    osdmap->pg_to_up_acting_osds(this->pgid, &up_osds, &up_primary, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      get_peering_state(shard)->activate_map(*get_peering_ctx(shard));
+    }
+  }
+
+private:
+  // Dispatch all messages from a map<int, Container<MessageRef>>.
+  // Templated to work with both std::vector (PeeringCtx::message_map) and
+  // std::list (MockPeeringListener::messages).
+  template <typename Container>
+  bool dispatch_messages_from_map(int from_shard,
+                                  std::map<int, Container>& msg_map) {
+    bool did_work = false;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (auto& [to_shard, msg_list] : msg_map) {
+      // Only deliver to shards that are currently in the acting set.
+      if (std::find(acting_osds.begin(), acting_osds.end(), to_shard) == acting_osds.end()) {
+        continue;
+      }
+
+      while (!msg_list.empty()) {
+        MessageRef m = msg_list.front();
+        msg_list.erase(msg_list.begin());
+
+        // Cast to MOSDPeeringOp - all peering messages inherit from this.
+        // Use dynamic_cast with assertion to catch unexpected message types.
+        // Use m.get() (not m.detach()) to avoid leaking the raw pointer.
+        MOSDPeeringOp *op = dynamic_cast<MOSDPeeringOp*>(m.get());
+        ceph_assert(op != nullptr) /* message must be a MOSDPeeringOp */;
+
+        // Set connection peer to the SENDER, not the destination
+        ceph_msg_header h = op->get_header();
+        h.src.num = from_shard;
+        op->set_header(h);
+
+        ConnectionRef conn = new MockConnection(from_shard);
+        op->set_connection(conn);
+
+        // get_event() returns a newly allocated PGPeeringEvent,
+        // so we take ownership directly into a shared_ptr (matching OSD.cc pattern)
+        PGPeeringEventRef evt_ref(op->get_event());
+
+        get_peering_state(to_shard)->handle_event(
+          evt_ref,
+          get_peering_ctx(to_shard));
+
+        did_work = true;
+      }
+    }
+
+    return did_work;
+  }
+
+public:
+  // Deliver all peering messages queued in from_shard's PeeringCtx.
+  bool dispatch_peering_messages(int from_shard) {
+    auto* ctx = get_peering_ctx(from_shard);
+    return dispatch_messages_from_map(from_shard, ctx->message_map);
+  }
+
+  // Deliver all cluster messages queued on from_shard's peering listener.
+  // NOTE(review): uses operator[], which default-inserts a null entry if the
+  // shard was never created - confirm callers only pass valid shards.
+  bool dispatch_cluster_messages(int from_shard) {
+    auto& listener = shard_peering_listeners[from_shard];
+    return dispatch_messages_from_map(from_shard, listener->messages);
+  }
+
+  // Repeatedly dispatch peering messages from every acting shard until a
+  // full round produces no work. Returns whether any work was done.
+  bool dispatch_all_peering_messages() {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_peering_messages(shard);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Drain and handle one shard's queued peering events (stalled or normal
+  // queue). Returns whether any event was handled.
+  bool dispatch_events(int shard, bool stalled = false) {
+    auto& listener = shard_peering_listeners[shard];
+    std::list<PGPeeringEventRef>& event_queue =
+      stalled ? listener->stalled_events : listener->events;
+
+    if (event_queue.empty()) {
+      return false;
+    }
+
+    bool did_work = false;
+    while (!event_queue.empty()) {
+      PGPeeringEventRef evt = event_queue.front();
+      event_queue.pop_front();
+
+      get_peering_state(shard)->handle_event(evt, get_peering_ctx(shard));
+      did_work = true;
+    }
+
+    return did_work;
+  }
+
+  // Repeatedly dispatch events on every acting shard until a full round
+  // produces no work.
+  bool dispatch_all_events(bool stalled = false) {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_events(shard, stalled);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Repeatedly dispatch cluster messages from every acting shard until a
+  // full round produces no work.
+  bool dispatch_all_cluster_messages() {
+    bool did_work = false;
+    bool work_this_round;
+
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    do {
+      work_this_round = false;
+      for (int shard : acting_osds) {
+        // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+        if (shard == CRUSH_ITEM_NONE) {
+          continue;
+        }
+        work_this_round |= dispatch_cluster_messages(shard);
+      }
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // Keep pumping peering messages, cluster messages and events until the
+  // whole system quiesces. Returns whether anything at all was dispatched.
+  bool dispatch_all() {
+    bool did_work = false;
+    bool work_this_round;
+
+    do {
+      work_this_round = false;
+      work_this_round |= dispatch_all_peering_messages();
+      work_this_round |= dispatch_all_cluster_messages();
+      work_this_round |= dispatch_all_events();
+      did_work |= work_this_round;
+    } while (work_this_round);
+
+    return did_work;
+  }
+
+  // IMPORTANT: For EC pools, shard positions in acting array must be preserved.
+  // Failed OSDs should be replaced with CRUSH_ITEM_NONE, not removed.
+  void update_osdmap_with_peering(
+    std::shared_ptr<OSDMap> new_osdmap,
+    std::optional<pg_shard_t> new_primary = std::nullopt);
+
+  bool new_epoch(bool if_required = false);
+
+  int queue_transaction_helper(int shard, ObjectStore::Transaction&& t);
+
+  void run_peering_cycle();
+
+  // OSDMap manipulation helpers - these create a new epoch and trigger peering
+
+  /**
+   * Mark an OSD as down (exists but not UP).
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osd_down(int osd_id);
+
+  /**
+   * Mark an OSD as up.
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osd_up(int osd_id);
+
+  /**
+   * Mark multiple OSDs as down.
+   * Creates a new OSDMap epoch and triggers peering.
+   */
+  void mark_osds_down(const std::vector<int>& osd_ids);
+
+  /**
+   * Advance to a new epoch without changing OSD states.
+   * Useful for testing re-peering scenarios.
+   */
+  void advance_epoch();
+
+  // True when every acting (non-failed) shard's PeeringState is active.
+  bool all_shards_active() {
+    // Get acting set from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    for (int shard : acting_osds) {
+      // Skip failed OSDs (marked as CRUSH_ITEM_NONE)
+      if (shard == CRUSH_ITEM_NONE) {
+        continue;
+      }
+      if (!get_peering_state(shard)->is_active()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // In EC pools, only the primary tracks PG_STATE_CLEAN.
+  bool all_shards_clean() {
+    // Get primary from OSDMap
+    std::vector<int> acting_osds;
+    int acting_primary = -1;
+    osdmap->pg_to_acting_osds(this->pgid, &acting_osds, &acting_primary);
+
+    if (acting_primary >= 0 && acting_primary != CRUSH_ITEM_NONE) {
+      return get_peering_state(acting_primary)->is_clean();
+    }
+    return false;
+  }
+
+  // Human-readable name of one shard's current peering state.
+  std::string get_state_name(int shard) {
+    return get_peering_state(shard)->get_current_state();
+  }
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <queue>
+#include <map>
+#include <list>
+#include <vector>
+#include "include/types.h"
+#include "messages/MOSDOp.h"
+#include "osd/OpRequest.h"
+#include "osd/PeeringState.h"
+#include "os/ObjectStore.h"
+
+/**
+ * EventLoop - Unified single-threaded event loop for OSD tests.
+ *
+ * Combines EC backend messages, ObjectStore transactions, peering messages,
+ * and peering events into a single deterministic queue. This allows tests
+ * to properly interleave peering state changes with EC backend operations.
+ */
+class EventLoop {
+public:
+  using GenericEvent = std::function<void()>;
+
+  // Categories used only for logging and statistics; every event is executed
+  // the same way regardless of its type.
+  enum class EventType {
+    GENERIC,
+    OSD_MESSAGE,
+    TRANSACTION,
+    PEERING_MESSAGE,
+    CLUSTER_MESSAGE,
+    PEERING_EVENT
+  };
+
+private:
+  // One queued unit of work: a type tag, an optional OSD id, and a callback.
+  struct Event {
+    EventType type;
+    int osd; // -1 for generic events
+    GenericEvent callback;
+
+    Event(EventType t, int o, GenericEvent cb)
+      : type(t), osd(o), callback(std::move(cb)) {}
+  };
+
+  std::queue<Event> event_queue;   // FIFO: events run in schedule order
+  bool verbose = false;            // print per-event trace when true
+  int events_executed = 0;         // total since construction / reset_stats()
+  std::map<EventType, int> events_by_type;
+
+  static constexpr const char* event_type_name(EventType type) {
+    switch (type) {
+    case EventType::GENERIC: return "GENERIC";
+    case EventType::OSD_MESSAGE: return "OSD_MESSAGE";
+    case EventType::TRANSACTION: return "TRANSACTION";
+    case EventType::PEERING_MESSAGE: return "PEERING_MESSAGE";
+    case EventType::CLUSTER_MESSAGE: return "CLUSTER_MESSAGE";
+    case EventType::PEERING_EVENT: return "PEERING_EVENT";
+    default: return "UNKNOWN";
+    }
+  }
+
+public:
+  // NOTE(review): non-explicit, so a bool converts implicitly to EventLoop;
+  // consider marking explicit if no caller relies on that conversion.
+  EventLoop(bool verbose = false) : verbose(verbose) {}
+
+  void schedule_generic(GenericEvent event) {
+    event_queue.emplace(EventType::GENERIC, -1, std::move(event));
+  }
+
+  void schedule_osd_message(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::OSD_MESSAGE, osd, std::move(callback));
+  }
+
+  void schedule_transaction(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::TRANSACTION, osd, std::move(callback));
+  }
+
+  void schedule_peering_message(int to_osd, GenericEvent callback) {
+    event_queue.emplace(EventType::PEERING_MESSAGE, to_osd, std::move(callback));
+  }
+
+  void schedule_cluster_message(int to_osd, GenericEvent callback) {
+    event_queue.emplace(EventType::CLUSTER_MESSAGE, to_osd, std::move(callback));
+  }
+
+  void schedule_peering_event(int osd, GenericEvent callback) {
+    event_queue.emplace(EventType::PEERING_EVENT, osd, std::move(callback));
+  }
+
+  bool has_events() const {
+    return !event_queue.empty();
+  }
+
+  size_t queued_event_count() const {
+    return event_queue.size();
+  }
+
+  int get_events_executed() const {
+    return events_executed;
+  }
+
+  const std::map<EventType, int>& get_stats_by_type() const {
+    return events_by_type;
+  }
+
+  // Reset counters only; queued events are untouched (see clear()).
+  void reset_stats() {
+    events_executed = 0;
+    events_by_type.clear();
+  }
+
+  // Pop and execute one event. Returns false when the queue is empty.
+  // Note: the event is popped BEFORE its callback runs, so a callback may
+  // safely schedule further events.
+  bool run_one() {
+    if (event_queue.empty()) {
+      return false;
+    }
+
+    Event event = std::move(event_queue.front());
+    event_queue.pop();
+
+    if (verbose) {
+      std::cout << "  [Event " << (events_executed + 1) << "] "
+                << event_type_name(event.type);
+      if (event.osd >= 0) {
+        std::cout << " (OSD " << event.osd << ")";
+      }
+      std::cout << " Executing..." << std::endl;
+    }
+
+    event.callback();
+    events_executed++;
+    events_by_type[event.type]++;
+
+    return true;
+  }
+
+  // Execute up to count events; returns how many actually ran (fewer if the
+  // queue drained early).
+  int run_many(int count) {
+    if (verbose) {
+      std::cout << "\n=== Running " << count << " events ===" << std::endl;
+    }
+
+    int executed = 0;
+    for (int i = 0; i < count && run_one(); i++) {
+      executed++;
+    }
+
+    if (verbose) {
+      std::cout << "=== Executed " << executed << " events, "
+                << event_queue.size() << " remaining ===" << std::endl;
+    }
+
+    return executed;
+  }
+
+  /**
+   * Run until the queue is empty or max_events is reached.
+   * max_events == 0 means unbounded.
+   * Returns -1 if max_events was reached before the queue emptied,
+   * otherwise the number of events executed.
+   */
+  int run_until_idle(int max_events = 0) {
+    if (verbose) {
+      std::cout << "\n=== Running until idle";
+      if (max_events > 0) {
+        std::cout << " (max " << max_events << " events)";
+      }
+      std::cout << " ===" << std::endl;
+    }
+
+    int executed = 0;
+    while (has_events()) {
+      if (max_events > 0 && executed >= max_events) {
+        if (verbose) {
+          std::cout << "=== Max events (" << max_events << ") reached, "
+                    << event_queue.size() << " events remaining ===" << std::endl;
+        }
+        return -1; // Timeout
+      }
+
+      run_one();
+      executed++;
+    }
+
+    if (verbose) {
+      std::cout << "=== Idle: Executed " << executed << " events ===" << std::endl;
+      print_stats();
+    }
+
+    return executed;
+  }
+
+  /**
+   * Run until a condition is met, idle, or max_events is reached.
+   * The condition is checked after each event execution (NOT before the
+   * first event - an already-true condition still runs one event).
+   * Returns -1 if max_events was reached.
+   * NOTE(review): when the queue drains without the condition becoming true,
+   * the executed count is returned just as in the success case - callers
+   * must re-check the condition to distinguish the two outcomes.
+   */
+  int run_until(int max_events, std::function<bool()> condition) {
+    if (verbose) {
+      std::cout << "\n=== Running until condition";
+      if (max_events > 0) {
+        std::cout << " (max " << max_events << " events)";
+      }
+      std::cout << " ===" << std::endl;
+    }
+
+    int executed = 0;
+    while (has_events()) {
+      if (max_events > 0 && executed >= max_events) {
+        if (verbose) {
+          std::cout << "=== Max events (" << max_events << ") reached ===" << std::endl;
+        }
+        return -1; // Timeout
+      }
+
+      run_one();
+      executed++;
+
+      if (condition()) {
+        if (verbose) {
+          std::cout << "=== Condition met after " << executed << " events ===" << std::endl;
+        }
+        return executed;
+      }
+    }
+
+    if (verbose) {
+      std::cout << "=== Idle: Executed " << executed << " events, condition not met ===" << std::endl;
+    }
+
+    return executed;
+  }
+
+  // Discard all queued events without running them. Stats are kept.
+  void clear() {
+    while (!event_queue.empty()) {
+      event_queue.pop();
+    }
+  }
+
+  void set_verbose(bool v) {
+    verbose = v;
+  }
+
+  void print_stats() const {
+    if (events_by_type.empty()) {
+      return;
+    }
+
+    std::cout << "=== Event Statistics ===" << std::endl;
+    for (const auto& [type, count] : events_by_type) {
+      std::cout << "  " << event_type_name(type) << ": " << count << std::endl;
+    }
+    std::cout << "  TOTAL: " << events_executed << std::endl;
+  }
+};
+
//MockConnection - simple stub. Required because PeeringState needs
//to know the features of the peer OSD which sent a peering message
class MockConnection : public Connection {
+ private:
+ int peer_osd;
+
public:
- MockConnection() : Connection(g_ceph_context, nullptr) {
+ MockConnection(int peer = -1) : Connection(g_ceph_context, nullptr), peer_osd(peer) {
set_features(CEPH_FEATURES_ALL);
}
+ int get_peer_osd() const {
+ return peer_osd;
+ }
+
bool is_connected() override {
return true;
}
return entity_addr_t();
}
};
-
#include <set>
#include "osd/PGBackend.h"
-// MockECReadPred - simple stub for IsPGReadablePredicate
-// Warning - this always returns true. This means we cannot test scenarios
-// where there are too many OSDs down and the PG should be incomplete
+/**
+ * MockECReadPred - configurable stub for IsPGReadablePredicate.
+ *
+ * When constructed with default arguments (k=0, m=0), always returns true
+ * (original behaviour, suitable for basic tests that don't need quorum
+ * checking).
+ *
+ * When constructed with real k and m values, implements proper quorum
+ * checking: the PG is readable if at least k shards are available (i.e.
+ * we have enough data shards to reconstruct the object without needing
+ * any coding shards).
+ *
+ * This enables negative testing of scenarios where too many OSDs are down
+ * and the PG should be unreadable.
+ *
+ * NOTE(review): only the shard *count* is checked, not which shards are
+ * present; a real EC read needs k distinct shards. Adequate for these
+ * tests - confirm if a test ever depends on shard identity.
+ */
class MockECReadPred : public IsPGReadablePredicate {
 public:
-  MockECReadPred() {}
-  bool operator()(const std::set<pg_shard_t> &_have) const override {
-    return true;
+  /**
+   * @param k Number of data chunks (0 = always-true mode)
+   * @param m Number of coding chunks (unused in read predicate, kept for
+   *          symmetry with MockECRecPred)
+   */
+  explicit MockECReadPred(int k = 0, int m = 0) : k(k), m(m) {}
+
+  bool operator()(const std::set<pg_shard_t> &have) const override {
+    // When k==0 fall back to always-true (backward-compatible default)
+    if (k == 0) {
+      return true;
+    }
+    // Readable when we have at least k shards available
+    return static_cast<int>(have.size()) >= k;
  }
+
+ private:
+  int k;
+  int m;
};
#include <set>
#include "osd/PGBackend.h"
-// MockECRecPred - simple stub for IsPGRecoverablePredicate
-// Warning - this always returns true. This means we cannot test scenarios
-// where there are too many OSDs down and the PG should be incomplete
+/**
+ * MockECRecPred - configurable stub for IsPGRecoverablePredicate.
+ *
+ * When constructed with default arguments (k=0, m=0), always returns true
+ * (original behaviour, suitable for basic tests that don't need quorum
+ * checking).
+ *
+ * When constructed with real k and m values, implements proper quorum
+ * checking: the PG is recoverable if at least k shards are available (i.e.
+ * we have enough shards to reconstruct all data, since any k-of-(k+m) EC
+ * scheme can recover from up to m failures).
+ *
+ * This enables negative testing of scenarios where too many OSDs are down
+ * and the PG should be marked Incomplete.
+ *
+ * NOTE(review): as with MockECReadPred, only the shard count is checked,
+ * not shard identity.
+ */
class MockECRecPred : public IsPGRecoverablePredicate {
 public:
-  MockECRecPred() {}
+  /**
+   * @param k Number of data chunks (0 = always-true mode)
+   * @param m Number of coding chunks (0 = always-true mode)
+   */
+  explicit MockECRecPred(int k = 0, int m = 0) : k(k), m(m) {}
-  bool operator()(const std::set<pg_shard_t> &_have) const override {
-    return true;
+  bool operator()(const std::set<pg_shard_t> &have) const override {
+    // When k==0 fall back to always-true (backward-compatible default)
+    if (k == 0) {
+      return true;
+    }
+    // Recoverable when we have at least k shards (can tolerate up to m failures)
+    return static_cast<int>(have.size()) >= k;
  }
+
+ private:
+  int k;
+  int m;
};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
-// vim: ts=8 sw=2 sts=2 expandtab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2026 IBM
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include "common/ostream_temp.h"
-
-//MockLog - simple stub
-class MockLog : public LoggerSinkSet {
- public:
- void debug(std::stringstream& s) final
- {
- std::cout << "\n<<debug>> " << s.str() << std::endl;
- }
-
- void info(std::stringstream& s) final
- {
- std::cout << "\n<<info>> " << s.str() << std::endl;
- }
-
- void sec(std::stringstream& s) final
- {
- std::cout << "\n<<sec>> " << s.str() << std::endl;
- }
-
- void warn(std::stringstream& s) final
- {
- std::cout << "\n<<warn>> " << s.str() << std::endl;
- }
-
- void error(std::stringstream& s) final
- {
- err_count++;
- std::cout << "\n<<error>> " << s.str() << std::endl;
- }
-
- OstreamTemp info() final { return OstreamTemp(CLOG_INFO, this); }
- OstreamTemp warn() final { return OstreamTemp(CLOG_WARN, this); }
- OstreamTemp error() final { return OstreamTemp(CLOG_ERROR, this); }
- OstreamTemp sec() final { return OstreamTemp(CLOG_ERROR, this); }
- OstreamTemp debug() final { return OstreamTemp(CLOG_DEBUG, this); }
-
- void do_log(clog_type prio, std::stringstream& ss) final
- {
- switch (prio) {
- case CLOG_DEBUG:
- debug(ss);
- break;
- case CLOG_INFO:
- info(ss);
- break;
- case CLOG_SEC:
- sec(ss);
- break;
- case CLOG_WARN:
- warn(ss);
- break;
- case CLOG_ERROR:
- default:
- error(ss);
- break;
- }
- }
-
- void do_log(clog_type prio, const std::string& ss) final
- {
- switch (prio) {
- case CLOG_DEBUG:
- debug() << ss;
- break;
- case CLOG_INFO:
- info() << ss;
- break;
- case CLOG_SEC:
- sec() << ss;
- break;
- case CLOG_WARN:
- warn() << ss;
- break;
- case CLOG_ERROR:
- default:
- error() << ss;
- break;
- }
- }
-
- virtual ~MockLog() {}
-
- int err_count{0};
- int expected_err_count{0};
- void set_expected_err_count(int c) { expected_err_count = c; }
-};
-
#pragma once
-#include <functional>
-#include <list>
-#include <optional>
-#include <vector>
#include "osd/PGBackend.h"
-#include "osd/ECUtil.h"
#include "os/ObjectStore.h"
// MockPGBackend - simple stub for PGBackend
#pragma once
+#include <functional>
+#include <vector>
#include <map>
-#include <set>
-#include <optional>
#include "osd/PGBackend.h"
+#include "osd/ECBackend.h"
+#include "osd/PGLog.h"
#include "osd/OSDMap.h"
#include "osd/osd_types.h"
-#include "osd/PGLog.h"
-#include "common/intrusive_timer.h"
-#include "common/ostream_temp.h"
-#include "global/global_context.h"
+#include "osd/osd_perf_counters.h"
+#include "osd/PeeringState.h"
+#include "common/ceph_context.h"
+#include "common/TrackedOp.h"
+#include "common/perf_counters.h"
+#include "messages/MOSDPGPush.h"
#include "os/ObjectStore.h"
+#include "global/global_context.h"
+#include "test/osd/MockConnection.h"
+#include "test/osd/EventLoop.h"
+#include "osd/OpRequest.h"
-// MockPGBackendListener - simple stub for PGBackend::Listener
-class MockPGBackendListener : public PGBackend::Listener {
+// MockPGBackendListener - mock PGBackend::Listener and ECListener for multi-instance testing.
+class MockPGBackendListener : public PGBackend::Listener, public ECListener {
public:
pg_info_t info;
OSDMapRef osdmap;
- const pg_pool_t pool;
+ int64_t pool_id;
PGLog log;
DoutPrefixProvider *dpp;
pg_shard_t pg_whoami;
std::set<pg_shard_t> shardset;
+
+ // Pointer to PeeringState for tests that use full peering
+ PeeringState *peering_state = nullptr;
+
+ shard_id_set acting_recovery_backfill_shard_id_set;
std::map<pg_shard_t, pg_info_t> shard_info;
std::map<pg_shard_t, pg_missing_t> shard_missing;
std::map<hobject_t, std::set<pg_shard_t>> missing_loc_shards;
pg_missing_tracker_t local_missing;
-
- MockPGBackendListener(OSDMapRef osdmap, const pg_pool_t pi, DoutPrefixProvider *dpp, pg_shard_t pg_whoami) :
- osdmap(osdmap), pool(pi), log(g_ceph_context), dpp(dpp), pg_whoami(pg_whoami) {}
+
+ std::vector<MessageRef> sent_messages;
+ std::vector<std::pair<int, MessageRef>> sent_messages_with_dest;
+
+ ObjectStore *store = nullptr;
+ ObjectStore::CollectionHandle ch;
+ EventLoop *event_loop = nullptr;
+ std::function<bool(OpRequestRef)> handle_message_callback;
+ std::map<int, std::function<bool(OpRequestRef)>> *message_router = nullptr;
+ OpTracker *op_tracker = nullptr;
+ PerfCounters *perf_logger = nullptr;
+
+ MockPGBackendListener(OSDMapRef osdmap, int64_t pool_id, DoutPrefixProvider *dpp, pg_shard_t pg_whoami, PeeringState *ps = nullptr) :
+ osdmap(osdmap), pool_id(pool_id), log(g_ceph_context), dpp(dpp), pg_whoami(pg_whoami), peering_state(ps) {
+ // Create a full OSD PerfCounters using the standard build_osd_logger function.
+ // This prevents null pointer dereferences when ReplicatedBackend calls get_logger()->inc().
+ perf_logger = build_osd_logger(g_ceph_context);
+ }
+
+ ~MockPGBackendListener() {
+ if (perf_logger) {
+ delete perf_logger;
+ perf_logger = nullptr;
+ }
+ }
+
+ void set_store(ObjectStore *s, ObjectStore::CollectionHandle c) {
+ store = s;
+ ch = c;
+ }
+
+ void set_event_loop(EventLoop *loop) {
+ event_loop = loop;
+ }
+
+ void set_op_tracker(OpTracker *tracker) {
+ op_tracker = tracker;
+ }
+
+ void set_peering_state(PeeringState *ps) {
+ peering_state = ps;
+ }
+
+ void set_handle_message_callback(std::function<bool(OpRequestRef)> cb) {
+ handle_message_callback = cb;
+ }
+
+ void set_message_router(std::map<int, std::function<bool(OpRequestRef)>> *router) {
+ message_router = router;
+ }
// Debugging
DoutPrefixProvider *get_dpp() override {
pg_shard_t peer,
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info) override {
+ if (peering_state) {
+ peering_state->on_peer_recover(peer, oid, recovery_info.version);
+ }
}
void begin_peer_recover(
pg_shard_t peer,
const hobject_t oid) override {
+ if (peering_state) {
+ peering_state->begin_peer_recover(peer, oid);
+ }
}
void apply_stats(
return c;
}
- // Messaging
+ // Routes messages through EventLoop for asynchronous EC message processing.
void send_message(int to_osd, Message *m) override {
+ MessageRef mref(m);
+ sent_messages.push_back(mref);
+ sent_messages_with_dest.push_back({to_osd, mref});
+
+ if (event_loop && op_tracker && message_router) {
+ // Capture the sender's OSD ID
+ int from_osd = pg_whoami.osd;
+
+ // IMPORTANT: Encode the message payload to simulate network transmission
+ // This ensures that txn_payload is moved to the middle section for MOSDRepOp messages
+ // Without this, Transaction::decode will fail because the message structure is incomplete
+ mref->encode_payload(CEPH_FEATURES_ALL);
+
+ event_loop->schedule_osd_message(to_osd, [this, mref, to_osd, from_osd]() {
+ if (!mref->get_connection()) {
+ // Set connection peer to the SENDER, not the destination
+ ConnectionRef conn = new MockConnection(from_osd);
+ mref->set_connection(conn);
+ }
+ OpRequestRef op = op_tracker->create_request<OpRequest>(mref.get());
+
+ // Route to the correct shard's backend using the message router
+ auto it = message_router->find(to_osd);
+ if (it != message_router->end()) {
+ it->second(op);
+ }
+ });
+ }
}
void queue_transaction(
ObjectStore::Transaction&& t,
OpRequestRef op = OpRequestRef()) override {
+ std::vector<ObjectStore::Transaction> tls;
+ tls.push_back(std::move(t));
+ queue_transactions(tls, op);
}
void queue_transactions(
std::vector<ObjectStore::Transaction>& tls,
OpRequestRef op = OpRequestRef()) override {
+ if (event_loop && store && ch) {
+ // Steal the Context callbacks from the transactions before calling MemStore.
+ // This allows the test harness to manage the context callbacks itself instead of using
+ // a Finisher thread. This keeps the test harness single threaded and gives more
+ // control for ordering async replies.
+ Context *on_apply = nullptr;
+ Context *on_apply_sync = nullptr;
+ Context *on_commit = nullptr;
+ ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit, &on_apply_sync);
+
+ // Execute transactions through the store (without contexts - we stole them)
+ store->queue_transactions(ch, tls, TrackedOpRef(), nullptr);
+
+ // Apply the on_apply_sync synchronously. This is what queue_transactions
+ // would do anyway.
+ // NOTE: Memstore will panic rather than fail
+ if (on_apply_sync) {
+ on_apply_sync->complete(0);
+ }
+
+ if (on_apply) {
+ event_loop->schedule_transaction(pg_whoami.osd, [on_apply]() mutable {
+ on_apply->complete(0);
+ });
+ }
+ if (on_commit) {
+ event_loop->schedule_transaction(pg_whoami.osd, [on_commit]() mutable {
+ on_commit->complete(0);
+ });
+ }
+ }
}
epoch_t get_interval_start_epoch() const override {
+ if (peering_state) {
+ return peering_state->get_info().history.same_interval_since;
+ }
return 1;
}
epoch_t get_last_peering_reset_epoch() const override {
+ if (peering_state) {
+ return peering_state->get_last_peering_reset();
+ }
return 1;
}
return shardset;
}
+ const shard_id_set &get_acting_recovery_backfill_shard_id_set() const {
+ return acting_recovery_backfill_shard_id_set;
+ }
+
const std::set<pg_shard_t> &get_acting_shards() const override {
+ if (peering_state) {
+ return peering_state->get_actingset();
+ }
return shardset;
}
const std::set<pg_shard_t> &get_backfill_shards() const override {
+ if (peering_state) {
+ return peering_state->get_backfill_targets();
+ }
return shardset;
}
}
const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards() const override {
+ if (peering_state) {
+ return peering_state->get_missing_loc().get_missing_locs();
+ }
return missing_loc_shards;
}
const pg_missing_tracker_t &get_local_missing() const override {
+ if (peering_state) {
+ return peering_state->get_pg_log().get_missing();
+ }
return local_missing;
}
void add_local_next_event(const pg_log_entry_t& e) override {
+ if (peering_state) {
+ peering_state->add_local_next_event(e);
+ }
}
const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+ if (peering_state) {
+ return peering_state->get_peer_missing();
+ }
return shard_missing;
}
const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const override {
+ if (peering_state) {
+ auto m = maybe_get_shard_missing(peer);
+ ceph_assert(m);
+ return *m;
+ }
return local_missing;
}
const std::map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+ if (peering_state) {
+ return peering_state->get_peer_info();
+ }
return shard_info;
}
const PGLog &get_log() const override {
+ if (peering_state) {
+ return peering_state->get_pg_log();
+ }
return log;
}
bool pgb_is_primary() const override {
- return true;
+ // For peering tests, use the PeeringState's view of primary
+ if (peering_state) {
+ return peering_state->is_primary();
+ }
+
+ // For basic tests without peering, query the OSDMap to determine primary
+ // This uses pg_temp if set, otherwise uses the CRUSH mapping
+ std::vector<int> acting;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(info.pgid.pgid, &acting, &acting_primary);
+
+ return pg_whoami.osd == acting_primary;
}
const OSDMapRef& pgb_get_osdmap() const override {
}
const pg_info_t &get_info() const override {
+ // When PeeringState is available, use its pg_info_t as the single source of truth
+ if (peering_state) {
+ return peering_state->get_info();
+ }
return info;
}
const pg_pool_t &get_pool() const override {
- return pool;
+ const pg_pool_t *p = osdmap->get_pg_pool(pool_id);
+ ceph_assert(p != nullptr);
+ return *p;
}
eversion_t get_pg_committed_to() const override {
+ if (peering_state) {
+ return peering_state->get_pg_committed_to();
+ }
return eversion_t();
}
bool transaction_applied,
ObjectStore::Transaction &t,
bool async = false) override {
+ // If we have a PeeringState, append the log entries to it
+ // This creates proper integration between backend operations and peering state
+ if (peering_state && !logv.empty()) {
+ peering_state->append_log(
+ std::move(logv),
+ trim_to,
+ roll_forward_to,
+ pg_committed_to,
+ t,
+ transaction_applied,
+ async);
+ }
}
void pgb_set_object_snap_mapping(
void update_peer_last_complete_ondisk(
pg_shard_t fromosd,
eversion_t lcod) override {
+ if (peering_state) {
+ peering_state->update_peer_last_complete_ondisk(fromosd, lcod);
+ }
}
void update_last_complete_ondisk(eversion_t lcod) override {
+ if (peering_state) {
+ peering_state->update_last_complete_ondisk(lcod);
+ }
}
void update_pct(eversion_t pct) override {
+ if (peering_state) {
+ peering_state->update_pct(pct);
+ }
}
void update_stats(const pg_stat_t &stat) override {
+ if (peering_state) {
+ peering_state->update_stats(
+ [&stat](auto &history, auto &stats) {
+ stats = stat;
+ return false;
+ });
+ }
}
void schedule_recovery_work(
}
pg_shard_t primary_shard() const override {
- return pg_shard_t();
+ if (peering_state) {
+ return peering_state->get_primary();
+ }
+
+ // Query the OSDMap to get the current primary
+ pg_t pgid = info.pgid.pgid;
+ std::vector<int> acting;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting, &acting_primary);
+
+ // For EC pools, the primary shard ID matches the OSD ID in the acting set
+ // For replicated pools, use NO_SHARD
+ if (pg_whoami.shard != shard_id_t::NO_SHARD) {
+ // EC pool: find the shard ID of the acting primary in the acting set
+ shard_id_t primary_shard_id = shard_id_t::NO_SHARD;
+ for (size_t i = 0; i < acting.size(); i++) {
+ if (acting[i] == acting_primary) {
+ primary_shard_id = shard_id_t(i);
+ break;
+ }
+ }
+ return pg_shard_t(acting_primary, primary_shard_id);
+ } else {
+ // Replicated pool: use NO_SHARD
+ return pg_shard_t(acting_primary, shard_id_t::NO_SHARD);
+ }
}
uint64_t min_peer_features() const override {
+ if (peering_state) {
+ return peering_state->get_min_peer_features();
+ }
return CEPH_FEATURES_ALL;
}
uint64_t min_upacting_features() const override {
+ if (peering_state) {
+ return peering_state->get_min_upacting_features();
+ }
return CEPH_FEATURES_ALL;
}
pg_feature_vec_t get_pg_acting_features() const override {
+ if (peering_state) {
+ return peering_state->get_pg_acting_features();
+ }
return pg_feature_vec_t();
}
void send_message_osd_cluster(
int peer, Message *m, epoch_t from_epoch) override {
+ send_message(peer, m);
}
void send_message_osd_cluster(
std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) override {
+ for (auto& [osd, m] : messages) {
+ send_message(osd, m);
+ }
}
-  void send_message_osd_cluster(MessageRef, Connection *con) override {
+  // Route via the connection's recorded peer OSD. Every Connection in these
+  // tests is a MockConnection; assert that explicitly instead of
+  // dereferencing a null result from a failed dynamic_cast.
+  void send_message_osd_cluster(MessageRef m, Connection *con) override {
+    MockConnection* mock_con = dynamic_cast<MockConnection*>(con);
+    ceph_assert(mock_con != nullptr);
+    send_message(mock_con->get_peer_osd(), m.get());
  }
  void send_message_osd_cluster(Message *m, const ConnectionRef& con) override {
+    MockConnection* mock_con = dynamic_cast<MockConnection*>(con.get());
+    ceph_assert(mock_con != nullptr);
+    send_message(mock_con->get_peer_osd(), m);
  }
void start_mon_command(
}
PerfCounters *get_logger() override {
- return nullptr;
+ return perf_logger;
}
ceph_tid_t get_tid() override {
bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
return false;
}
+ void add_temp_obj(const hobject_t &oid) override {
+ }
+
+ void clear_temp_obj(const hobject_t &oid) override {
+ }
+
+ const pg_missing_const_i * maybe_get_shard_missing(
+ pg_shard_t peer) const override {
+ if (peering_state) {
+ if (peer == peering_state->get_primary()) {
+ return &peering_state->get_pg_log().get_missing();
+ } else {
+ auto i = peering_state->get_peer_missing().find(peer);
+ if (i == peering_state->get_peer_missing().end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ }
+ return &local_missing;
+ }
+
+ const pg_info_t &get_shard_info(pg_shard_t peer) const override {
+ if (peering_state) {
+ if (peer == peering_state->get_primary()) {
+ return peering_state->get_info();
+ } else {
+ auto i = peering_state->get_peer_info().find(peer);
+ ceph_assert(i != peering_state->get_peer_info().end());
+ return i->second;
+ }
+ }
+
+ auto it = shard_info.find(peer);
+ if (it != shard_info.end()) {
+ return it->second;
+ }
+ return info;
+ }
+
+ bool is_missing_object(const hobject_t& oid) const override {
+ return false;
+ }
+ void send_message_osd_cluster(
+ int osd, MOSDPGPush* msg, epoch_t from_epoch) override {
+ send_message(osd, msg);
+ }
struct ECListener *get_eclistener() override {
- return nullptr;
+ return static_cast<ECListener *>(this);
}
};
#include "osd/PGLog.h"
#include "os/ObjectStore.h"
-#include "test/osd/MockPGBackend.h"
-
-// dout using global context and OSD subsystem
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_osd
+#include "MockPGBackend.h"
// MockPGLogEntryHandler
//
// LogEntryHandler
void remove(const hobject_t &hoid) override {
- dout(0) << "MockPGLogEntryHandler::remove " << hoid << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::remove " << hoid << dendl;
backend->remove(hoid, t);
}
void try_stash(const hobject_t &hoid, version_t v) override {
- dout(0) << "MockPGLogEntryHandler::try_stash " << hoid << " " << v << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::try_stash " << hoid << " " << v << dendl;
backend->try_stash(hoid, v, t);
}
void rollback(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::rollback " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::rollback " << entry << dendl;
ceph_assert(entry.can_rollback());
backend->rollback(entry, t);
}
void rollforward(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::rollforward " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::rollforward " << entry << dendl;
backend->rollforward(entry, t);
}
void trim(const pg_log_entry_t &entry) override {
- dout(0) << "MockPGLogEntryHandler::trim " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::trim " << entry << dendl;
backend->trim(entry, t);
}
void partial_write(pg_info_t *info, eversion_t previous_version,
const pg_log_entry_t &entry
) override {
- dout(0) << "MockPGLogEntryHandler::partial_write " << entry << dendl;
+ lgeneric_dout(g_ceph_context, 0) << "MockPGLogEntryHandler::partial_write " << entry << dendl;
backend->partial_write(info, previous_version, entry);
}
};
-#undef dout_context
-#undef dout_subsys
-
#pragma once
-#include <list>
-#include <map>
#include <memory>
-#include <set>
-#include <string>
#include <vector>
+#include <list>
+#include <map>
#include "osd/PeeringState.h"
#include "osd/osd_perf_counters.h"
-#include "common/perf_counters_collection.h"
-#include "global/global_context.h"
+#include "common/HeartbeatMap.h"
#include "os/ObjectStore.h"
-#include "test/osd/MockLog.h"
-#include "test/osd/MockPGBackend.h"
-#include "test/osd/MockPGBackendListener.h"
-#include "test/osd/MockPGLogEntryHandler.h"
+#include "MockPGBackendListener.h"
+#include "MockPGBackend.h"
+#include "MockPGLogEntryHandler.h"
+#include "global/global_context.h"
-// dout using global context and OSD subsystem
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_osd
-using namespace std;
-
-// Mock PeeringListener - stub of PeeringState::PeeringListener
-// to help with testing of PeeringState. Keep track of calls
-// from PeeringState and emulate some of PrimaryLogPG/PG
-// functionality for testing purposes.
-//
-// There are some inject_* variables that can be used to help
-// tests create race hazards or test failure paths
+// Mock implementation of PeeringState::PeeringListener for testing.
+// inject_* variables can be used to create race hazards or test failure paths.
class MockPeeringListener : public PeeringState::PeeringListener {
public:
pg_shard_t pg_whoami;
- MockLog logger;
PeeringState *ps;
- unique_ptr<MockPGBackendListener> backend_listener;
+ std::unique_ptr<MockPGBackendListener> backend_listener;
coll_t coll;
ObjectStore::CollectionHandle ch;
- unique_ptr<MockPGBackend> backend;
+ std::unique_ptr<MockPGBackend> backend;
PerfCounters* recoverystate_perf;
PerfCounters* logger_perf;
std::vector<int> next_acting;
// migration requests with too full
bool inject_fail_reserve_recovery_space = false;
+ std::function<int(ObjectStore::Transaction&&)> queue_transaction_callback;
+
MockPeeringListener(OSDMapRef osdmap,
- const pg_pool_t pi,
+ int64_t pool_id,
DoutPrefixProvider *dpp,
pg_shard_t pg_whoami) : pg_whoami(pg_whoami) {
- backend_listener = make_unique<MockPGBackendListener>(osdmap, pi, dpp, pg_whoami);
- backend = make_unique<MockPGBackend>(g_ceph_context, backend_listener.get(), nullptr, coll, ch);
+ backend_listener = std::make_unique<MockPGBackendListener>(osdmap, pool_id, dpp, pg_whoami);
+ backend = std::make_unique<MockPGBackend>(g_ceph_context, backend_listener.get(), nullptr, coll, ch);
recoverystate_perf = build_recoverystate_perf(g_ceph_context);
g_ceph_context->get_perfcounters_collection()->add(recoverystate_perf);
logger_perf = build_osd_logger(g_ceph_context);
g_ceph_context->get_perfcounters_collection()->add(logger_perf);
}
- // EpochSource interface
+ ~MockPeeringListener() {
+ if (recoverystate_perf) {
+ g_ceph_context->get_perfcounters_collection()->remove(recoverystate_perf);
+ delete recoverystate_perf;
+ recoverystate_perf = nullptr;
+ }
+ if (logger_perf) {
+ g_ceph_context->get_perfcounters_collection()->remove(logger_perf);
+ delete logger_perf;
+ logger_perf = nullptr;
+ }
+ }
+
epoch_t get_osdmap_epoch() const override {
return current_epoch;
}
bool need_write_epoch,
ObjectStore::Transaction &t) override {
prepare_write_called = true;
+
+ // If a callback is set, queue the transaction
+ if (queue_transaction_callback && !t.empty()) {
+ ObjectStore::Transaction copy;
+ copy.append(t);
+ queue_transaction_callback(std::move(copy));
+ }
}
void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override {
}
void log_state_enter(const char *state) override {
- last_state_entered = string(state);
+ last_state_entered = std::string(state);
state_entered = true;
}
void log_state_exit(
const char *state_name, utime_t enter_time,
uint64_t events, utime_t event_dur) override {
- last_state_exited = string(state_name);
+ last_state_exited = std::string(state_name);
state_exited = true;
}
}
OstreamTemp get_clog_info() override {
- return logger.info();
+ return OstreamTemp(CLOG_INFO, nullptr);
}
OstreamTemp get_clog_error() override {
- return logger.error();
+ return OstreamTemp(CLOG_ERROR, nullptr);
}
OstreamTemp get_clog_debug() override {
- return logger.debug();
+ return OstreamTemp(CLOG_DEBUG, nullptr);
}
void on_activate_complete() override {
removal_called = true;
}
- // Test state tracking
unsigned target_pg_log_entries = 100;
bool renew_lease_scheduled = false;
bool check_readable_queued = false;
bool recovery_space_reserved = false;
bool recovery_space_unreserved = false;
bool missing_set_rebuilt = false;
- string last_state_entered;
+ std::string last_state_entered;
bool state_entered = false;
- string last_state_exited;
+ std::string last_state_exited;
bool state_exited = false;
mutable bool recovery_info_dumped = false;
epoch_t current_epoch = 1;
bool first_write_in_interval = false;
};
-#undef dout_context
-#undef dout_subsys
-
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "osd/OSDMap.h"
+#include "osd/osd_types.h"
+
+// Utility functions for managing OSDMap state in tests.
+// (Previously in OSDMapTestHelpers.h — embedded here as the sole user.)
+class OSDMapTestHelpers {
+public:
+ // Add or update a pool in the OSDMap. Pass pool_id=-1 to auto-assign.
+ // Returns the pool id actually used. Creates a new epoch.
+ static int64_t add_pool(
+ OSDMap& osdmap,
+ int64_t pool_id,
+ const pg_pool_t& pool,
+ const std::string& pool_name = "")
+ {
+ if (pool_id < 0) {
+ pool_id = osdmap.get_pool_max() + 1;
+ }
+
+ // Default the name from the id when the caller did not supply one.
+ std::string name = pool_name.empty() ?
+ ("pool_" + std::to_string(pool_id)) : pool_name;
+
+ // Use OSDMap::Incremental to properly add pool and pool name
+ // This ensures both pools map and pool_name map are updated correctly
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_pools[pool_id] = pool;
+ inc.new_pool_names[pool_id] = name;
+
+ osdmap.apply_incremental(inc);
+
+ return pool_id;
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static int64_t add_pool(
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ const pg_pool_t& pool,
+ const std::string& pool_name = "")
+ {
+ return add_pool(*osdmap, pool_id, pool, pool_name);
+ }
+
+ // Look up a pool by id; returns nullptr if the pool does not exist.
+ static const pg_pool_t* get_pool(
+ const OSDMap& osdmap,
+ int64_t pool_id)
+ {
+ return osdmap.get_pg_pool(pool_id);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static const pg_pool_t* get_pool(
+ const std::shared_ptr<OSDMap>& osdmap,
+ int64_t pool_id)
+ {
+ return get_pool(*osdmap, pool_id);
+ }
+
+ // Set acting set for a PG using pg_temp (standard Ceph mechanism for overriding CRUSH).
+ // For EC pools with nonprimary_shards optimization, pg_temp must be stored in
+ // "primaryfirst" order (primary-capable shards first). This simulates what the
+ // monitor does in production when initially setting up pg_temp.
+ // An empty 'acting' vector clears any existing pg_temp entry instead.
+ static void set_pg_acting(
+ OSDMap& osdmap,
+ pg_t pgid,
+ const std::vector<int>& acting)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+
+ if (acting.empty()) {
+ // Empty acting set means remove pg_temp
+ inc.new_pg_temp[pgid] = mempool::osdmap::vector<int32_t>();
+ } else {
+ // For EC pools with optimizations, transform to primaryfirst order.
+ // This is used for initial setup. For dynamic changes during peering,
+ // the test should let peering detect invalid primaries and request
+ // corrections via queue_want_pg_temp().
+ std::vector<int> transformed_acting = acting;
+ const pg_pool_t* pool = osdmap.get_pg_pool(pgid.pool());
+ if (pool && pool->allows_ecoptimizations()) {
+ transformed_acting = osdmap.pgtemp_primaryfirst(*pool, acting);
+ }
+
+ // Copy into the mempool-allocated vector type that pg_temp requires.
+ mempool::osdmap::vector<int32_t> temp_acting;
+ for (int osd : transformed_acting) {
+ temp_acting.push_back(osd);
+ }
+ inc.new_pg_temp[pgid] = temp_acting;
+ }
+
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void set_pg_acting(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ const std::vector<int>& acting)
+ {
+ set_pg_acting(*osdmap, pgid, acting);
+ }
+
+ // Fill 'acting' with the PG's current acting set; returns true when non-empty.
+ static bool get_pg_acting(
+ const OSDMap& osdmap,
+ pg_t pgid,
+ std::vector<int>& acting)
+ {
+ acting.clear();
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ return !acting.empty();
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static bool get_pg_acting(
+ const std::shared_ptr<OSDMap>& osdmap,
+ pg_t pgid,
+ std::vector<int>& acting)
+ {
+ return get_pg_acting(*osdmap, pgid, acting);
+ }
+
+ // Pin the acting primary for a PG via primary_temp. Creates a new epoch.
+ static void set_pg_acting_primary(
+ OSDMap& osdmap,
+ pg_t pgid,
+ int primary)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_primary_temp[pgid] = primary;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void set_pg_acting_primary(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ int primary)
+ {
+ set_pg_acting_primary(*osdmap, pgid, primary);
+ }
+
+ // Fetch the acting primary for a PG; returns true when a primary (>= 0) exists.
+ static bool get_pg_acting_primary(
+ const OSDMap& osdmap,
+ pg_t pgid,
+ int& primary)
+ {
+ std::vector<int> acting;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ return primary >= 0;
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static bool get_pg_acting_primary(
+ const std::shared_ptr<OSDMap>& osdmap,
+ pg_t pgid,
+ int& primary)
+ {
+ return get_pg_acting_primary(*osdmap, pgid, primary);
+ }
+
+ // Build an in-memory EC pg_pool_t with k data + m coding shards.
+ // NOTE(review): pool_id is currently unused in this builder — TODO confirm
+ // whether it was meant to be stored on the pool or can be dropped.
+ static pg_pool_t create_ec_pool(
+ int k,
+ int m,
+ uint64_t stripe_width,
+ uint64_t flags,
+ int64_t pool_id = 0)
+ {
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_ERASURE;
+ pool.size = k + m;
+ pool.min_size = k;
+ pool.crush_rule = 0;
+ pool.erasure_code_profile = "default";
+ pool.stripe_width = stripe_width;
+
+ // Set flags as specified by caller
+ pool.flags = flags;
+
+ // Only set nonprimary_shards if OPTIMIZATIONS flag is set
+ if (flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) {
+ // Mark shards 1 to k-1 (inclusive) as nonprimary
+ // Shard 0 can be primary, shards k to k+m-1 (coding shards) can be primary
+ for (int i = 1; i < k; i++) {
+ pool.nonprimary_shards.insert(shard_id_t(i));
+ }
+ }
+
+ return pool;
+ }
+
+ // Build an in-memory replicated pg_pool_t.
+ // NOTE(review): pool_id is currently unused here as well — TODO confirm intent.
+ static pg_pool_t create_replicated_pool(
+ int size,
+ int min_size,
+ int64_t pool_id = 0)
+ {
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_REPLICATED;
+ pool.size = size;
+ pool.min_size = min_size;
+ pool.crush_rule = 0;
+
+ return pool;
+ }
+
+ // Place OSDs 0..k+m-1 as the acting set for an EC PG.
+ // The primary_shard argument is deliberately not applied (see comment below).
+ static void setup_ec_pg(
+ OSDMap& osdmap,
+ pg_t pgid,
+ int k,
+ int m,
+ int primary_shard = 0)
+ {
+ std::vector<int> acting;
+ for (int i = 0; i < k + m; i++) {
+ acting.push_back(i);
+ }
+ set_pg_acting(osdmap, pgid, acting);
+ // Don't set primary_temp for EC pools - let OSDMap determine the primary
+ // based on the pool's nonprimary_shards configuration
+ // set_pg_acting_primary(osdmap, pgid, primary_shard);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void setup_ec_pg(
+ std::shared_ptr<OSDMap> osdmap,
+ pg_t pgid,
+ int k,
+ int m,
+ int primary_shard = 0)
+ {
+ setup_ec_pg(*osdmap, pgid, k, m, primary_shard);
+ }
+
+ // Copy the pool, unset the flag, then apply via incremental.
+ // Asserts that the pool exists; creates a new epoch.
+ static void clear_pool_flag(
+ OSDMap& osdmap,
+ int64_t pool_id,
+ uint64_t flag)
+ {
+ const pg_pool_t* existing = osdmap.get_pg_pool(pool_id);
+ ceph_assert(existing != nullptr);
+
+ pg_pool_t updated = *existing;
+ updated.unset_flag(flag);
+
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_pools[pool_id] = updated;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void clear_pool_flag(
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ uint64_t flag)
+ {
+ clear_pool_flag(*osdmap, pool_id, flag);
+ }
+
+ // OSD state manipulation methods
+
+ /**
+ * Mark an OSD as down (exists but not UP) in the OSDMap.
+ * Creates a new epoch.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_id The OSD to mark as down
+ */
+ static void mark_osd_down(OSDMap& osdmap, int osd_id)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osd_down(std::shared_ptr<OSDMap> osdmap, int osd_id)
+ {
+ mark_osd_down(*osdmap, osd_id);
+ }
+
+ /**
+ * Mark an OSD as up in the OSDMap.
+ * Creates a new epoch.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_id The OSD to mark as up
+ */
+ static void mark_osd_up(OSDMap& osdmap, int osd_id)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osd_up(std::shared_ptr<OSDMap> osdmap, int osd_id)
+ {
+ mark_osd_up(*osdmap, osd_id);
+ }
+
+ /**
+ * Mark multiple OSDs as down in the OSDMap.
+ * All state changes land in a single incremental, so only one new epoch
+ * is created regardless of how many OSDs are listed.
+ *
+ * @param osdmap The OSDMap to modify
+ * @param osd_ids The OSDs to mark as down
+ */
+ static void mark_osds_down(OSDMap& osdmap, const std::vector<int>& osd_ids)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ for (int osd_id : osd_ids) {
+ inc.new_state[osd_id] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+ }
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void mark_osds_down(std::shared_ptr<OSDMap> osdmap, const std::vector<int>& osd_ids)
+ {
+ mark_osds_down(*osdmap, osd_ids);
+ }
+
+ /**
+ * Advance to a new epoch without changing OSD states.
+ * Useful for testing re-peering scenarios.
+ *
+ * @param osdmap The OSDMap to modify
+ */
+ static void advance_epoch(OSDMap& osdmap)
+ {
+ OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+ inc.fsid = osdmap.get_fsid();
+ osdmap.apply_incremental(inc);
+ }
+
+ // Convenience overload for shared_ptr-held maps.
+ static void advance_epoch(std::shared_ptr<OSDMap> osdmap)
+ {
+ advance_epoch(*osdmap);
+ }
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "test/osd/PGBackendTestFixture.h"
+#include "common/errno.h"
+
+// Build the full EC test topology: an OSDMap with k+m up OSDs, an EC pool,
+// one PG whose acting set is OSDs 0..k+m-1, a per-shard collection in the
+// object store, one MockPGBackendListener + ECSwitch backend per shard, and
+// a message router wiring every shard's _handle_message to the others.
+// Uses the fixture fields k, m, stripe_unit, pool_flags, ec_plugin,
+// ec_technique and (optionally) listener_factory.
+void PGBackendTestFixture::setup_ec_pool()
+{
+ CephContext *cct = g_ceph_context;
+
+ osdmap = std::make_shared<OSDMap>();
+ osdmap->set_max_osd(k + m);
+
+ // Seed initial per-OSD state directly; the "up/in" transition below is
+ // then done through an incremental so derived fields get recomputed.
+ for (int i = 0; i < k + m; i++) {
+ osdmap->set_state(i, CEPH_OSD_EXISTS);
+ osdmap->set_weight(i, CEPH_OSD_OUT);
+ osdmap->crush->set_item_name(i, "osd." + std::to_string(i));
+ }
+
+ // Use incremental to set OSDs as up and with proper features
+ OSDMap::Incremental inc(osdmap->get_epoch() + 1);
+ inc.fsid = osdmap->get_fsid();
+
+ for (int i = 0; i < k + m; i++) {
+ inc.new_state[i] = CEPH_OSD_UP;
+ inc.new_weight[i] = CEPH_OSD_IN;
+
+ // Set up_thru to a high value to avoid WaitUpThru state during initial peering
+ // The OSDMap will go through several increments (adding pools, etc.) so we need
+ // up_thru to be higher than the final epoch
+ inc.new_up_thru[i] = 100;
+
+ // Set OSD features to include NAUTILUS, OCTOPUS and QUINCY server features (required for peering)
+ osd_xinfo_t xinfo;
+ xinfo.features = CEPH_FEATUREMASK_SERVER_NAUTILUS | CEPH_FEATUREMASK_SERVER_OCTOPUS | CEPH_FEATUREMASK_SERVER_QUINCY;
+ inc.new_xinfo[i] = xinfo;
+ }
+
+ // Apply the incremental to set state, weight, and features
+ // This will properly calculate up_osd_features
+ osdmap->apply_incremental(inc);
+
+ // Create the EC pool and pin its acting set to OSDs 0..k+m-1.
+ pg_pool_t pool = OSDMapTestHelpers::create_ec_pool(k, m, stripe_unit * k, pool_flags, pool_id);
+ OSDMapTestHelpers::add_pool(osdmap, pool_id, pool);
+
+ pgid = pg_t(0, pool_id);
+ spgid = spg_t(pgid, shard_id_t(0));
+
+ OSDMapTestHelpers::setup_ec_pg(osdmap, pgid, k, m, 0);
+
+ // Finalize the CRUSH map to calculate working_size
+ // This is required for crush_init_workspace() to work correctly
+ osdmap->crush->finalize();
+
+ // Select the erasure-code implementation: either the in-process mock, or
+ // a real plugin loaded via the plugin registry.
+ if (ec_plugin == "mock") {
+ ec_impl = std::make_shared<MockErasureCode>(k, k + m);
+ } else {
+ ErasureCodeProfile profile;
+ profile["k"] = std::to_string(k);
+ profile["m"] = std::to_string(m);
+ profile["plugin"] = ec_plugin;
+
+ if (!ec_technique.empty()) {
+ profile["technique"] = ec_technique;
+ }
+
+ profile["stripe_unit"] = std::to_string(stripe_unit);
+
+ std::stringstream ss;
+ // Tests are run from the build directory, so "./lib" points to the
+ // erasure code plugins in the build tree rather than /usr/local/lib64/ceph/erasure-code/
+ int ret = ceph::ErasureCodePluginRegistry::instance().factory(
+ ec_plugin,
+ "./lib",
+ profile,
+ &ec_impl,
+ &ss);
+
+ if (ret != 0) {
+ FAIL() << "Failed to create EC plugin '" << ec_plugin << "': " << ss.str();
+ return;
+ }
+ }
+
+ // One collection per shard; shard 0's handles double as the fixture-level
+ // ch/coll convenience members.
+ ObjectStore::Transaction t;
+ for (int i = 0; i < k + m; i++) {
+ spg_t shard_spgid(pgid, shard_id_t(i));
+ coll_t shard_coll(shard_spgid);
+ auto shard_ch = store->create_new_collection(shard_coll);
+ t.create_collection(shard_coll, 0);
+
+ colls[i] = shard_coll;
+ chs[i] = shard_ch;
+
+ if (i == 0) {
+ ch = shard_ch;
+ coll = shard_coll;
+ }
+ }
+
+ ASSERT_EQ(store->queue_transaction(ch, std::move(t)), 0);
+
+ const pg_pool_t* pool_ptr = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool_ptr != nullptr);
+
+ // Create one listener + ECSwitch backend per shard.
+ for (int i = 0; i < k + m; i++) {
+ std::unique_ptr<MockPGBackendListener> shard_listener;
+ if (listener_factory) {
+ shard_listener = listener_factory(
+ i,
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t(i)));
+ } else {
+ shard_listener = std::make_unique<MockPGBackendListener>(
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t(i))
+ );
+ }
+
+ // Initialize the listener's own info.pgid so OSDMap queries work
+ shard_listener->info.pgid = spg_t(pgid, shard_id_t(i));
+
+ for (int j = 0; j < k + m; j++) {
+ shard_listener->shardset.insert(pg_shard_t(j, shard_id_t(j)));
+ shard_listener->acting_recovery_backfill_shard_id_set.insert(shard_id_t(j));
+
+ // Initialize shard_info for each shard - required by EC backend
+ pg_info_t shard_pg_info;
+ shard_pg_info.pgid = spg_t(pgid, shard_id_t(j));
+ shard_listener->shard_info[pg_shard_t(j, shard_id_t(j))] = shard_pg_info;
+
+ // Initialize shard_missing for each shard - required by EC backend
+ pg_missing_t shard_missing;
+ shard_listener->shard_missing[pg_shard_t(j, shard_id_t(j))] = shard_missing;
+ }
+
+ shard_listener->set_store(store.get(), chs[i]);
+ shard_listener->set_event_loop(event_loop.get());
+ shard_listener->set_op_tracker(op_tracker.get());
+
+ auto shard_lru = std::make_unique<ECExtentCache::LRU>(1024 * 1024 * 100);
+ auto shard_ec_switch = std::make_unique<ECSwitch>(
+ shard_listener.get(), colls[i], chs[i], store.get(),
+ cct, ec_impl, stripe_unit * k, *shard_lru);
+
+ listeners[i] = std::move(shard_listener);
+ lrus[i] = std::move(shard_lru);
+ backends[i] = std::move(shard_ec_switch);
+ }
+
+ // Route messages addressed to shard i into backend i's _handle_message.
+ for (int i = 0; i < k + m; i++) {
+ message_router[i] = [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ };
+ }
+
+ for (int i = 0; i < k + m; i++) {
+ listeners[i]->set_message_router(&message_router);
+ listeners[i]->set_handle_message_callback(
+ [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ });
+ }
+}
+
+// Build the replicated test topology: an OSDMap with num_replicas OSDs, a
+// replicated pool, a single PG with OSD 0 pinned as primary, one shared
+// collection for all replicas, one MockPGBackendListener + ReplicatedBackend
+// per replica, and a message router between them.
+void PGBackendTestFixture::setup_replicated_pool()
+{
+ CephContext *cct = g_ceph_context;
+
+ osdmap = std::make_shared<OSDMap>();
+ osdmap->set_max_osd(num_replicas);
+ osdmap->set_state(0, CEPH_OSD_EXISTS | CEPH_OSD_UP);
+
+ pg_pool_t pool;
+ pool.type = pg_pool_t::TYPE_REPLICATED;
+ pool.size = num_replicas;
+ pool.min_size = min_size;
+ pool.crush_rule = 0;
+
+ osdmap->inc_epoch();
+
+ OSDMapTestHelpers::add_pool(osdmap, pool_id, pool);
+
+ // Finalize the CRUSH map to calculate working_size
+ // This is required for crush_init_workspace() to work correctly
+ osdmap->crush->finalize();
+
+ pgid = pg_t(0, pool_id);
+ spgid = spg_t(pgid, shard_id_t::NO_SHARD);
+
+ // Set up pg_temp to define the acting set with OSD 0 as primary
+ std::vector<int> acting;
+ for (int i = 0; i < num_replicas; i++) {
+ acting.push_back(i);
+ }
+ OSDMapTestHelpers::set_pg_acting(osdmap, pgid, acting);
+ OSDMapTestHelpers::set_pg_acting_primary(osdmap, pgid, 0);
+
+ ObjectStore::Transaction t;
+ spg_t replica_spgid(pgid, shard_id_t::NO_SHARD);
+ coll_t replica_coll(replica_spgid);
+ auto replica_ch = store->create_new_collection(replica_coll);
+ t.create_collection(replica_coll, 0);
+
+ ASSERT_EQ(store->queue_transaction(replica_ch, std::move(t)), 0);
+
+ // All replicas share the same collection
+ for (int i = 0; i < num_replicas; i++) {
+ colls[i] = replica_coll;
+ chs[i] = replica_ch;
+ }
+
+ ch = replica_ch;
+ coll = replica_coll;
+
+ const pg_pool_t* pool_ptr = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool_ptr != nullptr);
+
+ // Create one listener + ReplicatedBackend per replica.
+ for (int i = 0; i < num_replicas; i++) {
+ std::unique_ptr<MockPGBackendListener> replica_listener;
+ if (listener_factory) {
+ replica_listener = listener_factory(
+ i,
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t::NO_SHARD));
+ } else {
+ replica_listener = std::make_unique<MockPGBackendListener>(
+ osdmap,
+ pool_id,
+ dpp.get(),
+ pg_shard_t(i, shard_id_t::NO_SHARD)
+ );
+ }
+
+ // Initialize the listener's own info.pgid so OSDMap queries work
+ replica_listener->info.pgid = spg_t(pgid, shard_id_t::NO_SHARD);
+
+ // For replicated pools, use NO_SHARD for all replicas
+ for (int j = 0; j < num_replicas; j++) {
+ replica_listener->shardset.insert(pg_shard_t(j, shard_id_t::NO_SHARD));
+
+ // Initialize shard_info for each replica - required by backend
+ pg_info_t replica_pg_info;
+ replica_pg_info.pgid = spg_t(pgid, shard_id_t::NO_SHARD);
+ replica_listener->shard_info[pg_shard_t(j, shard_id_t::NO_SHARD)] = replica_pg_info;
+
+ // Initialize shard_missing for each replica - required by backend
+ pg_missing_t replica_missing;
+ replica_listener->shard_missing[pg_shard_t(j, shard_id_t::NO_SHARD)] = replica_missing;
+ }
+
+ replica_listener->set_store(store.get(), chs[i]);
+ replica_listener->set_event_loop(event_loop.get());
+ replica_listener->set_op_tracker(op_tracker.get());
+
+ auto replica_backend = std::make_unique<ReplicatedBackend>(
+ replica_listener.get(), colls[i], chs[i], store.get(), cct);
+
+ listeners[i] = std::move(replica_listener);
+ backends[i] = std::move(replica_backend);
+ }
+
+ // Route messages addressed to replica i into backend i's _handle_message.
+ for (int i = 0; i < num_replicas; i++) {
+ message_router[i] = [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ };
+ }
+
+ for (int i = 0; i < num_replicas; i++) {
+ listeners[i]->set_message_router(&message_router);
+ listeners[i]->set_handle_message_callback(
+ [this, i](OpRequestRef op) -> bool {
+ return backends[i]->_handle_message(op);
+ });
+ }
+}
+
+// Submit a PG transaction through the primary backend and run the event loop
+// until the completion callback fires. Returns the completion result code.
+// Throws std::runtime_error if the transaction does not complete within the
+// event-loop budget (run_until_idle(10000)).
+int PGBackendTestFixture::do_transaction_and_complete(
+ const hobject_t& hoid,
+ PGTransactionUPtr pg_t,
+ const object_stat_sum_t& delta_stats,
+ const eversion_t& at_version,
+ std::vector<pg_log_entry_t> log_entries)
+{
+ eversion_t trim_to(0, 0);
+ eversion_t pg_committed_to(0, 0);
+ std::optional<pg_hit_set_history_t> hset_history;
+
+ // Captured by reference: on_complete runs before this function returns
+ // (the event loop is drained synchronously below).
+ bool completed = false;
+ int completion_result = -1;
+ Context *on_complete = new LambdaContext([&completed, &completion_result](int r) {
+ completed = true;
+ completion_result = r;
+ });
+
+ ceph_tid_t tid = 1;
+ osd_reqid_t reqid(entity_name_t::OSD(0), 0, tid);
+
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ primary_backend->submit_transaction(
+ hoid,
+ delta_stats,
+ at_version,
+ std::move(pg_t),
+ trim_to,
+ pg_committed_to,
+ std::move(log_entries),
+ hset_history,
+ on_complete,
+ tid,
+ reqid,
+ OpRequestRef()
+ );
+
+ // Drive all queued backend/replica events to completion.
+ event_loop->run_until_idle(10000);
+
+ if (!completed) {
+ throw std::runtime_error("Transaction did not complete within timeout");
+ }
+
+ return completion_result;
+}
+
+// Create a new object and write 'data' at offset 0 in a single transaction.
+// On success (result 0) the object context is updated to reflect the new
+// size and version. Returns the transaction completion code.
+int PGBackendTestFixture::create_and_write(
+ const std::string& obj_name,
+ const std::string& data,
+ const eversion_t& at_version)
+{
+ hobject_t hoid = make_test_object(obj_name);
+ PGTransactionUPtr pg_t = std::make_unique<PGTransaction>();
+ pg_t->create(hoid);
+
+ // Fresh (non-existing) object context for the create path.
+ ObjectContextRef obc = make_object_context(hoid, false, 0);
+ pg_t->obc_map[hoid] = obc;
+
+ bufferlist bl;
+ bl.append(data);
+ pg_t->write(hoid, 0, bl.length(), bl);
+
+ object_stat_sum_t delta_stats;
+ delta_stats.num_objects = 1;
+ delta_stats.num_bytes = bl.length();
+
+ // Single MODIFY log entry; creates are not rollbackable here.
+ std::vector<pg_log_entry_t> log_entries;
+ pg_log_entry_t entry;
+ entry.mark_unrollbackable();
+ entry.op = pg_log_entry_t::MODIFY;
+ entry.soid = hoid;
+ entry.version = at_version;
+ entry.prior_version = eversion_t(0, 0);
+ log_entries.push_back(entry);
+
+ int result = do_transaction_and_complete(
+ hoid, std::move(pg_t), delta_stats, at_version, std::move(log_entries));
+
+ if (result == 0) {
+ obc->obs.exists = true;
+ obc->obs.oi.size = bl.length();
+ obc->obs.oi.version = at_version;
+ }
+
+ return result;
+}
+
+// Overwrite an existing object at 'offset' with 'data'. 'object_size' is the
+// object's size before this write; delta_stats only counts bytes that grow
+// the object. On success the object context is advanced to at_version and
+// the (possibly larger) new size. Returns the transaction completion code.
+int PGBackendTestFixture::write(
+ const std::string& obj_name,
+ uint64_t offset,
+ const std::string& data,
+ const eversion_t& prior_version,
+ const eversion_t& at_version,
+ uint64_t object_size)
+{
+ hobject_t hoid = make_test_object(obj_name);
+ PGTransactionUPtr pg_t = std::make_unique<PGTransaction>();
+
+ // Existing-object context seeded with the pre-write size/version.
+ ObjectContextRef obc = make_object_context(hoid, true, object_size);
+ obc->obs.oi.version = prior_version;
+ pg_t->obc_map[hoid] = obc;
+
+ bufferlist bl;
+ bl.append(data);
+ pg_t->write(hoid, offset, bl.length(), bl);
+
+ object_stat_sum_t delta_stats;
+ uint64_t new_size = std::max(object_size, offset + bl.length());
+ if (new_size > object_size) {
+ delta_stats.num_bytes = new_size - object_size;
+ } else {
+ delta_stats.num_bytes = 0;
+ }
+
+ std::vector<pg_log_entry_t> log_entries;
+ pg_log_entry_t entry;
+ // Don't mark as unrollbackable - partial writes need rollback support
+ entry.op = pg_log_entry_t::MODIFY;
+ entry.soid = hoid;
+ entry.version = at_version;
+ entry.prior_version = prior_version;
+ log_entries.push_back(entry);
+
+ int result = do_transaction_and_complete(
+ hoid, std::move(pg_t), delta_stats, at_version, std::move(log_entries));
+
+ if (result == 0) {
+ obc->obs.oi.size = new_size;
+ obc->obs.oi.version = at_version;
+ }
+
+ return result;
+}
+
+// Read [offset, offset+length) of an object into out_data.
+// EC pools go through the asynchronous ECSwitch read path (driven to
+// completion on the event loop); replicated pools use the synchronous
+// ReplicatedBackend path. Returns the per-extent read result code.
+// Throws std::runtime_error if the EC read does not complete in time.
+int PGBackendTestFixture::read_object(
+ const std::string& obj_name,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& out_data,
+ uint64_t object_size)
+{
+ hobject_t hoid = make_test_object(obj_name);
+
+ if (pool_type == EC) {
+ bool completed = false;
+ int completion_result = -1;
+
+ std::list<std::pair<ec_align_t, std::pair<bufferlist*, Context*>>> to_read;
+
+ ec_align_t align(offset, length, 0);
+
+ // Per-extent completion: records the result for the single extent we read.
+ Context *read_complete = new LambdaContext([&completed, &completion_result](int r) {
+ completed = true;
+ completion_result = r;
+ });
+
+ to_read.push_back(std::make_pair(align, std::make_pair(&out_data, read_complete)));
+
+ // Whole-operation completion: intentionally a no-op; we only report the
+ // per-extent result above.
+ Context *on_complete = new LambdaContext([](int r) {
+ });
+
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ ECSwitch* ec_switch = dynamic_cast<ECSwitch*>(primary_backend);
+ ceph_assert(ec_switch != nullptr);
+
+ ec_switch->objects_read_async(
+ hoid,
+ object_size,
+ to_read,
+ on_complete,
+ false
+ );
+
+ event_loop->run_until_idle(10000);
+
+ if (!completed) {
+ throw std::runtime_error("Read operation did not complete within timeout");
+ }
+
+ return completion_result;
+ } else {
+ // NOTE(review): object_size is unused on the replicated path — the sync
+ // read only needs offset/length. Confirm whether it should be validated.
+ PGBackend* primary_backend = get_primary_backend();
+ ceph_assert(primary_backend != nullptr);
+ ReplicatedBackend* rep_backend = dynamic_cast<ReplicatedBackend*>(primary_backend);
+ ceph_assert(rep_backend != nullptr);
+
+ int result = rep_backend->objects_read_sync(
+ hoid,
+ offset,
+ length,
+ 0,
+ &out_data
+ );
+
+ return result;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// NOTE: update_osdmap() intentionally does NOT reconcile listener acting sets
+//
+// This method updates only:
+// - The fixture's osdmap pointer
+// - The osdmap reference in all listeners
+//
+// It does NOT update the following fields on any MockPGBackendListener:
+// - shardset
+// - acting_recovery_backfill_shard_id_set
+// - shard_info
+// - shard_missing
+//
+// This is intentional: those fields describe the acting set as seen by each
+// individual OSD, and their correct values depend on the specific failure
+// scenario being simulated. Updating them blindly here would hide bugs and
+// make it impossible to test partial-failure cases.
+//
+// Callers that need to simulate an OSD failure MUST update those fields
+// themselves before (or after) calling update_osdmap().
+//
+// See TestECFailover::simulate_osd_failure() for a worked example that
+// removes the failed shard from shardset and
+// acting_recovery_backfill_shard_id_set on every listener before delegating
+// to update_osdmap().
+// ---------------------------------------------------------------------------
+// @param new_osdmap  the map to install on the fixture and all listeners
+// @param new_primary currently unused here — NOTE(review): confirm whether a
+//                    caller depends on it or it can be dropped from the API
+void PGBackendTestFixture::update_osdmap(
+ std::shared_ptr<OSDMap> new_osdmap,
+ std::optional<pg_shard_t> new_primary)
+{
+ // Step 1: Call on_change() on all backends to clear in-flight operations
+ for (auto& [instance, be] : backends) {
+ if (be) {
+ be->on_change();
+ }
+ }
+
+ // Step 2: Update the osdmap reference
+ osdmap = new_osdmap;
+
+ // Step 3: Update the osdmap in all listeners
+ for (auto& [instance, list] : listeners) {
+ if (list) {
+ list->osdmap = new_osdmap;
+ }
+ }
+}
+
+// Best-effort removal of the per-test MemStore data directory.
+// Safe to call multiple times (TearDown and the destructor both call it);
+// it is a no-op once the directory is gone.
+void PGBackendTestFixture::cleanup_data_dir()
+{
+ // Only clean up if the directory exists and hasn't been cleaned already
+ if (!data_dir.empty() && std::filesystem::exists(data_dir)) {
+ std::error_code ec;
+ // Non-throwing overload: errors are captured in 'ec' and ignored.
+ std::filesystem::remove_all(data_dir, ec);
+ // Silently ignore errors during cleanup - we tried our best
+ }
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <filesystem>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <iomanip>
+#include <gtest/gtest.h>
+#include "common/errno.h"
+#include "test/osd/MockErasureCode.h"
+#include "test/osd/MockPGBackendListener.h"
+#include "test/osd/EventLoop.h"
+#include "common/TrackedOp.h"
+#include "os/memstore/MemStore.h"
+#include "osd/ECSwitch.h"
+#include "osd/ECExtentCache.h"
+#include "osd/ReplicatedBackend.h"
+#include "osd/PGBackend.h"
+#include "osd/OSDMap.h"
+#include "osd/osd_types.h"
+#include "osd/PGTransaction.h"
+#include "common/ceph_context.h"
+#include "os/ObjectStore.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "test/osd/OSDMapTestHelpers.h"
+
+// Unified test fixture for EC and Replicated backend tests with ObjectStore.
+// Uses PoolType to branch between EC (ECSwitch) and Replicated (ReplicatedBackend).
+class PGBackendTestFixture : public ::testing::Test {
+public:
+ enum PoolType {
+ EC,
+ REPLICATED
+ };
+
+protected:
+ PoolType pool_type;
+
+ // Pool flags to set on the EC pool (e.g., FLAG_EC_OVERWRITES, FLAG_EC_OPTIMIZATIONS).
+ // Derived classes can set this before SetUp() to configure the pool flags.
+ // setup_ec_pool() uses this value when creating the pool.
+ // Default includes both OVERWRITES and OPTIMIZATIONS flags.
+ uint64_t pool_flags = pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS;
+
+ std::unique_ptr<MemStore> store;
+ std::string data_dir;
+ ObjectStore::CollectionHandle ch;
+ coll_t coll;
+
+ std::shared_ptr<OSDMap> osdmap;
+ std::unique_ptr<OpTracker> op_tracker;
+ std::unique_ptr<EventLoop> event_loop;
+ std::map<int, std::function<bool(OpRequestRef)>> message_router;
+
+ std::map<int, std::unique_ptr<MockPGBackendListener>> listeners;
+ std::map<int, std::unique_ptr<PGBackend>> backends;
+ std::map<int, coll_t> colls;
+ std::map<int, ObjectStore::CollectionHandle> chs;
+
+ /**
+ * Optional listener factory callback.
+ *
+ * If set, setup_ec_pool() and setup_replicated_pool() will call this
+ * factory instead of constructing MockPGBackendListener directly.
+ * The factory receives the instance index and the parameters needed to
+ * construct the listener, and must return a unique_ptr to the new
+ * MockPGBackendListener. The returned object is stored in listeners[i]
+ * as usual, so ownership stays with the base class.
+ *
+ * Derived classes (e.g. ECPeeringTestFixture) can set this in their
+ * constructor to gain direct access to the created listeners without
+ * needing to steal ownership via release_listener().
+ */
+ std::function<std::unique_ptr<MockPGBackendListener>(
+ int instance,
+ std::shared_ptr<OSDMap> osdmap,
+ int64_t pool_id,
+ DoutPrefixProvider* dpp,
+ pg_shard_t whoami)> listener_factory;
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ std::map<int, std::unique_ptr<ECExtentCache::LRU>> lrus;
+ int k = 4; // data chunks
+ int m = 2; // coding chunks
+ uint64_t stripe_unit = 4096; // aka chunk_size
+ std::string ec_plugin = "isa";
+ std::string ec_technique = "reed_sol_van";
+
+ int num_replicas = 3;
+ int min_size = 2;
+
+ int64_t pool_id = 0;
+ pg_t pgid;
+ spg_t spgid;
+
+ class TestDpp : public NoDoutPrefix {
+ public:
+ TestDpp(CephContext *cct) : NoDoutPrefix(cct, ceph_subsys_osd) {}
+
+ std::ostream& gen_prefix(std::ostream& out) const override {
+ out << "PGBackendTest: ";
+ return out;
+ }
+ };
+ std::unique_ptr<TestDpp> dpp;
+
+public:
+ explicit PGBackendTestFixture(PoolType type = EC) : pool_type(type)
+ {
+ std::random_device rd;
+ std::mt19937_64 gen(rd());
+ std::uniform_int_distribution<uint64_t> dis;
+ uint64_t random_num = dis(gen);
+
+ std::ostringstream oss;
+ oss << "memstore_test_" << std::hex << std::setfill('0') << std::setw(16) << random_num;
+ data_dir = oss.str();
+
+ ceph_assert(stripe_unit % 4096 == 0);
+ ceph_assert(stripe_unit != 0);
+ }
+
+ ~PGBackendTestFixture() {
+ // Ensure cleanup happens even if TearDown() wasn't called or failed
+ cleanup_data_dir();
+ }
+
+ void SetUp() override {
+ int r = ::mkdir(data_dir.c_str(), 0777);
+ if (r < 0) {
+ r = -errno;
+ std::cerr << __func__ << ": unable to create " << data_dir << ": " << cpp_strerror(r) << std::endl;
+ }
+ ASSERT_EQ(0, r);
+
+ // Create MemStore - contexts are stolen by MockPGBackendListener, so we don't need manual_finisher
+ store.reset(new MemStore(g_ceph_context, data_dir));
+ ASSERT_TRUE(store);
+ ASSERT_EQ(0, store->mkfs());
+ ASSERT_EQ(0, store->mount());
+
+ g_conf().set_safe_to_start_threads();
+
+ CephContext *cct = g_ceph_context;
+ dpp = std::make_unique<TestDpp>(cct);
+ event_loop = std::make_unique<EventLoop>(false);
+ op_tracker = std::make_unique<OpTracker>(cct, false, 1);
+
+ if (pool_type == EC) {
+ setup_ec_pool();
+ } else {
+ setup_replicated_pool();
+ }
+ }
+
+ void TearDown() override {
+ // 0. Process any remaining events in the EventLoop.
+ // If the test passed, orphaned events indicate a bug - warn and skip draining
+ // so the test fails loudly. If the test already failed, drain silently to
+ // allow the rest of TearDown to complete without cascading errors.
+ if (event_loop) {
+ if (event_loop->has_events()) {
+ if (!HasFailure()) {
+ ADD_FAILURE() << "TearDown: " << event_loop->queued_event_count()
+ << " orphaned events remain after a passing test";
+ }
+ event_loop->run_until_idle(1000);
+ }
+ }
+
+ // 1. Clean up all backend instances (polymorphic cleanup)
+ // Note: We skip calling on_change() during teardown as it may access
+ // invalid state. The backends will be destroyed anyway.
+ backends.clear();
+
+ // 2. Clean up EC-specific resources
+ if (pool_type == EC) {
+ lrus.clear();
+ ec_impl.reset();
+ }
+
+ // 3. Clean up listeners
+ listeners.clear();
+
+ // 4. Reset op tracker (call on_shutdown first)
+ if (op_tracker) {
+ op_tracker->on_shutdown();
+ op_tracker.reset();
+ }
+
+ // 5. Reset all collection handles
+ chs.clear();
+ colls.clear();
+
+ if (ch) {
+ ch.reset();
+ }
+
+ // 6. Unmount and destroy the store
+ if (store) {
+ store->umount();
+ store.reset();
+ }
+
+ // 7. Clean up the test directory
+ cleanup_data_dir();
+ }
+
+private:
+ void setup_ec_pool();
+ void setup_replicated_pool();
+ void cleanup_data_dir();
+
+public:
+ // Look up this fixture's pool in the current osdmap; asserts it exists.
+ const pg_pool_t& get_pool() const {
+ const pg_pool_t* pool = OSDMapTestHelpers::get_pool(osdmap, pool_id);
+ ceph_assert(pool != nullptr);
+ return *pool;
+ }
+
+ // Number of backend instances: k + m shards for EC, num_replicas otherwise.
+ int get_instance_count() const {
+ return pool_type == EC ? (k + m) : num_replicas;
+ }
+
+ // Number of EC data chunks (k); only meaningful for EC pools.
+ int get_data_chunk_count() const {
+ return k;
+ }
+
+ // Number of EC coding chunks (m); only meaningful for EC pools.
+ int get_coding_chunk_count() const {
+ return m;
+ }
+
+ // EC stripe width in bytes: stripe_unit * k.
+ uint64_t get_stripe_width() const {
+ return stripe_unit * k;
+ }
+
+ // Configured min_size for the pool.
+ int get_min_size() const {
+ return min_size;
+ }
+
+ // Scan all listeners and return the one that currently reports itself as
+ // primary via pgb_is_primary(), or nullptr when no listener does.
+ virtual MockPGBackendListener* get_primary_listener() {
+ MockPGBackendListener* found = nullptr;
+ for (auto it = listeners.begin(); it != listeners.end() && !found; ++it) {
+ auto& candidate = it->second;
+ if (candidate && candidate->pgb_is_primary()) {
+ found = candidate.get();
+ }
+ }
+ return found;
+ }
+
+ // Return the backend paired with the primary listener, or nullptr when no
+ // listener is primary or the primary instance has no registered backend.
+ virtual PGBackend* get_primary_backend() {
+ for (auto& entry : listeners) {
+ const auto& lst = entry.second;
+ if (!lst || !lst->pgb_is_primary()) {
+ continue;
+ }
+ auto backend_it = backends.find(entry.first);
+ if (backend_it == backends.end()) {
+ return nullptr;
+ }
+ return backend_it->second.get();
+ }
+ return nullptr;
+ }
+
+ // Build an hobject_t for a head object (CEPH_NOSNAP) named `name` in this
+ // fixture's pool; empty key and namespace, hash 0.
+ hobject_t make_test_object(const std::string& name) const {
+ return hobject_t(object_t(name), "", CEPH_NOSNAP, 0, pool_id, "");
+ }
+
+ // Construct a minimal ObjectContext for hoid with the given existence flag
+ // and size; no snapset context is attached (ssc stays null).
+ ObjectContextRef make_object_context(
+ const hobject_t& hoid,
+ bool exists = false,
+ uint64_t size = 0) const
+ {
+ auto ctx = std::make_shared<ObjectContext>();
+ ctx->ssc = nullptr;
+ ctx->obs.exists = exists;
+ ctx->obs.oi = object_info_t(hoid);
+ ctx->obs.oi.size = size;
+ return ctx;
+ }
+
+ int do_transaction_and_complete(
+ const hobject_t& hoid,
+ PGTransactionUPtr pg_t,
+ const object_stat_sum_t& delta_stats,
+ const eversion_t& at_version,
+ std::vector<pg_log_entry_t> log_entries);
+
+ virtual int create_and_write(
+ const std::string& obj_name,
+ const std::string& data,
+ const eversion_t& at_version = eversion_t(1, 1));
+
+public:
+
+ int write(
+ const std::string& obj_name,
+ uint64_t offset,
+ const std::string& data,
+ const eversion_t& prior_version,
+ const eversion_t& at_version,
+ uint64_t object_size);
+
+ int read_object(
+ const std::string& obj_name,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& out_data,
+ uint64_t object_size);
+
+ /**
+ * Update the OSDMap and trigger backend cleanup.
+ *
+ * Calls on_change() on all backends, then updates the osdmap reference in
+ * the fixture and all listeners. Optionally updates the primary field on
+ * every MockPGBackendListener and the convenience pointers (listener, backend).
+ *
+ * Does NOT update acting-set fields (shardset,
+ * acting_recovery_backfill_shard_id_set, shard_info, shard_missing) on any
+ * listener — those depend on the specific failure scenario being simulated
+ * and must be updated by the caller. See TestECFailover::simulate_osd_failure()
+ * for a worked example.
+ */
+ virtual void update_osdmap(
+ std::shared_ptr<OSDMap> new_osdmap,
+ std::optional<pg_shard_t> new_primary = std::nullopt);
+
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+ * TestBackendBasics - Unified parameterized test harness for EC and Replicated
+ * backend operations.
+ *
+ * Two fixture classes are defined, each parameterized over the full set of
+ * backend configurations:
+ *
+ * TestBackendBasics
+ * Parameterized over BackendWriteReadParam (BackendConfig × WriteReadParam).
+ * 13 backends × 8 data sizes = 104 instances per test body.
+ *
+ * WriteThenRead – write data, verify protocol messages, read back, verify
+ * data integrity.
+ * PartialWrite – create an object, perform a partial write at a non-zero
+ * offset, read back and verify all three regions.
+ * DirectRead – optimized-EC only: write one full stripe of patterned
+ * data, then issue a direct (per-shard) sync read to each
+ * data shard and verify its contents.
+ *
+ * TestECFailover
+ * Parameterized over BackendConfig (EC configs only, 12 instances).
+ * Failover is an EC-specific concept (shard-based primary election).
+ *
+ * BasicOSDMapUpdate – write, update OSDMap epoch, verify read still works.
+ * PrimaryFailover – write, fail OSD 0, verify new primary and degraded
+ * read with EC reconstruction.
+ */
+
+#include <gtest/gtest.h>
+#include "test/osd/PGBackendTestFixture.h"
+#include "test/osd/TestCommon.h"
+#include "messages/MOSDECSubOpWrite.h"
+
+using namespace std;
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics fixture
+// ---------------------------------------------------------------------------
+
+/**
+ * TestBackendBasics - single fixture parameterized over BackendWriteReadParam.
+ *
+ * The constructor reads the BackendConfig portion of the parameter and
+ * configures the base fixture fields before GTest runs SetUp(): for EC pools
+ * it sets pool_type, k, m, stripe_unit, ec_plugin, ec_technique and
+ * pool_flags; for replicated pools it sets pool_type, num_replicas and
+ * min_size.
+ */
+class TestBackendBasics : public PGBackendTestFixture,
+ public ::testing::WithParamInterface<BackendWriteReadParam> {
+public:
+ TestBackendBasics() : PGBackendTestFixture() {
+ const auto& config = GetParam().backend;
+ pool_type = config.pool_type;
+ if (pool_type == EC) {
+ k = config.k;
+ m = config.m;
+ stripe_unit = config.stripe_unit;
+ ec_plugin = config.ec_plugin;
+ ec_technique = config.ec_technique;
+ pool_flags = config.pool_flags;
+ } else {
+ // Replicated pools ignore the EC fields of the config; every
+ // replicated instance uses a fixed 3-replica, min_size=2 layout.
+ num_replicas = 3;
+ min_size = 2;
+ }
+ }
+
+ // No SetUp() override: the previous override only forwarded to
+ // PGBackendTestFixture::SetUp(), which GTest invokes directly anyway.
+};
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: WriteThenRead
+// ---------------------------------------------------------------------------
+
+/**
+ * WriteThenRead - write data of the parameterized size, verify protocol
+ * messages were sent, read back, and verify data integrity.
+ *
+ * For EC backends: asserts that MSG_OSD_EC_WRITE messages were sent and that
+ * read messages are sent to shards.
+ * For Replicated backends: asserts that at least one message was sent.
+ */
+TEST_P(TestBackendBasics, WriteThenRead) {
+ const auto& size_param = GetParam().write_read;
+ const auto& cfg = GetParam().backend;
+ const bool is_ec = (cfg.pool_type == EC);
+
+ const std::string payload(size_param.size, size_param.fill);
+ const std::string obj_name = "test_backend_" + cfg.label + "_" + size_param.label;
+
+ // Execute create+write operation
+ int write_rc = create_and_write(obj_name, payload);
+ EXPECT_EQ(write_rc, 0) << size_param.label << " write should complete successfully";
+
+ // Verify messages were sent to replicas/shards
+ auto* primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ ASSERT_GT(primary->sent_messages.size(), 0u)
+ << "Should send messages to replicas/shards";
+
+ // For EC backends: verify EC write messages were sent
+ if (is_ec) {
+ int ec_writes = 0;
+ for (const auto& msg : primary->sent_messages) {
+ if (msg->get_type() == MSG_OSD_EC_WRITE) {
+ ec_writes++;
+ }
+ }
+ ASSERT_GT(ec_writes, 0) << "Should send EC write messages";
+ }
+
+ // Clear sent messages before read to distinguish read messages
+ primary->sent_messages.clear();
+ primary->sent_messages_with_dest.clear();
+
+ // Perform the read operation over the full payload.
+ bufferlist rbl;
+ int read_rc = read_object(obj_name, 0, payload.length(), rbl, payload.length());
+ EXPECT_GE(read_rc, 0) << size_param.label << " read should complete successfully";
+
+ // Verify data length
+ ASSERT_EQ(rbl.length(), payload.length())
+ << size_param.label << " read data length should match written data length";
+
+ // Verify data content
+ std::string round_trip(rbl.c_str(), rbl.length());
+ EXPECT_EQ(round_trip, payload)
+ << size_param.label << " read data should match written data";
+
+ // For EC backends: verify read messages were sent to shards
+ if (is_ec) {
+ primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ ASSERT_GT(primary->sent_messages.size(), 0u)
+ << "Should send read messages to EC shards";
+ }
+
+ // All events should be processed by now
+ ASSERT_FALSE(event_loop->has_events()) << "Event loop should be idle after read";
+
+ primary = get_primary_listener();
+ if (primary) {
+ primary->sent_messages.clear();
+ }
+}
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: PartialWrite
+// ---------------------------------------------------------------------------
+
+/**
+ * PartialWrite - create an object of size max(parameterized size, 12 KB),
+ * write a partial region at a non-zero offset, read back and verify that:
+ * - the region before the partial write is unchanged,
+ * - the partial-write region contains the new data,
+ * - the region after the partial write is unchanged.
+ *
+ * NOTE(review): the object size is NOT rounded to the EC stripe width; the
+ * backend is assumed to handle unaligned sizes — confirm this is intended
+ * for all EC configs.
+ */
+TEST_P(TestBackendBasics, PartialWrite) {
+ const auto& param = GetParam().write_read;
+ const auto& backend_config = GetParam().backend;
+
+ std::string obj_name = "test_partial_" + backend_config.label + "_" + param.label;
+
+ // Use the parameterized size as the initial object size, but ensure it is
+ // large enough to accommodate a non-trivial partial write. We need at least
+ // 3 regions: prefix, modified, suffix. Use max(param.size, 3 * 4096) so
+ // that even the smallest size parameters produce a meaningful test.
+ const size_t initial_size = std::max(param.size, size_t(3 * 4096));
+
+ // Partial write covers the middle third of the object (aligned to 4 KB).
+ // `region` can be 0 only if initial_size < 3 * 4096, which the max() above
+ // prevents; the 4096 fallbacks below are defensive.
+ const size_t region = (initial_size / 3) & ~size_t(4095); // round down to 4 KB
+ const size_t partial_offset = region ? region : 4096;
+ const size_t partial_size = region ? region : 4096;
+
+ // Create initial data filled with the parameterized fill character
+ std::string initial_data(initial_size, param.fill);
+
+ int result = create_and_write(obj_name, initial_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << param.label << " initial write should complete successfully";
+
+ // Partial write data uses the next fill character (wraps around 'z' -> 'a')
+ char partial_fill = (param.fill == 'z') ? 'a' : (param.fill + 1);
+ std::string partial_data(partial_size, partial_fill);
+
+ result = write(
+ obj_name,
+ partial_offset,
+ partial_data,
+ eversion_t(1, 1), // prior_version
+ eversion_t(1, 2), // at_version
+ initial_size // object_size
+ );
+ EXPECT_EQ(result, 0) << param.label << " partial write should complete successfully";
+
+ // Read back the entire object
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, initial_size, read_data, initial_size);
+ EXPECT_GE(read_result, 0)
+ << param.label << " read after partial write should complete successfully";
+
+ ASSERT_EQ(read_data.length(), initial_size)
+ << param.label << " read data length should match object size";
+
+ const char* buf = read_data.c_str();
+
+ // Region before the partial write should be unchanged
+ for (size_t i = 0; i < partial_offset; i++) {
+ ASSERT_EQ(buf[i], param.fill)
+ << param.label << " data before partial write offset should be unchanged at position " << i;
+ }
+
+ // Partial-write region should contain the new fill character
+ for (size_t i = partial_offset; i < partial_offset + partial_size; i++) {
+ ASSERT_EQ(buf[i], partial_fill)
+ << param.label << " data at partial write region should be '" << partial_fill
+ << "' at position " << i;
+ }
+
+ // Region after the partial write should be unchanged
+ for (size_t i = partial_offset + partial_size; i < initial_size; i++) {
+ ASSERT_EQ(buf[i], param.fill)
+ << param.label << " data after partial write region should be unchanged at position " << i;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// TestBackendBasics: DirectRead
+// ---------------------------------------------------------------------------
+
+/**
+ * DirectRead - test EC direct reads to individual shards.
+ *
+ * This test:
+ * 1. Skips non-optimized EC (we don't support sync reads there)
+ * 2. Writes patterned data covering an entire stripe
+ * 3. Performs sync reads to each data shard with EC_DIRECT_READ flag
+ * 4. Verifies data integrity for each shard
+ *
+ * NOTE(review): the pattern and its verification assume data is striped
+ * stripe_unit-at-a-time across data shards 0..k-1 in order — confirm this
+ * matches the EC placement for all tested plugins.
+ */
+TEST_P(TestBackendBasics, DirectRead) {
+ const auto& param = GetParam().write_read;
+ const auto& backend_config = GetParam().backend;
+
+ // Skip test for non-EC backends
+ if (backend_config.pool_type != EC) {
+ GTEST_SKIP() << "DirectRead test only applies to EC backends";
+ }
+
+ // Skip test for non-optimized EC - we don't support sync reads
+ if (!(backend_config.pool_flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS)) {
+ GTEST_SKIP() << "DirectRead test requires optimized EC";
+ }
+
+ std::string obj_name = "test_direct_read_" + backend_config.label + "_" + param.label;
+
+ // Get stripe width from the pool
+ uint64_t stripe_width = get_stripe_width();
+
+ // Create patterned data where each stripe_unit has a distinct pattern
+ // This allows us to verify we're reading the correct shard
+ std::string test_data;
+ test_data.reserve(stripe_width);
+
+ for (size_t i = 0; i < stripe_width; i++) {
+ // Pattern: each stripe_unit gets a different character based on its shard position
+ size_t shard_index = i / stripe_unit;
+ char fill_char = 'A' + (shard_index % 26);
+ test_data.push_back(fill_char);
+ }
+
+ // Write the data (one full stripe)
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << param.label << " write should complete successfully";
+
+ hobject_t hoid = make_test_object(obj_name);
+
+ // Perform direct reads to each data shard (skip coding shards)
+ for (auto& [shard_id, backend] : backends) {
+ // Skip coding shards - only test data shards
+ if (shard_id >= k) {
+ continue;
+ }
+
+ ASSERT_TRUE(backend != nullptr) << "Backend for shard " << shard_id << " should not be null";
+
+ ECSwitch* ec_switch = dynamic_cast<ECSwitch*>(backend.get());
+ ASSERT_TRUE(ec_switch != nullptr) << "Backend should be ECSwitch for EC pools";
+
+ bufferlist shard_data;
+
+ // Perform sync read with EC_DIRECT_READ flag
+ // Read the entire stripe - we expect only this shard's data back
+ int read_result = ec_switch->objects_read_sync(
+ hoid,
+ 0, // offset
+ stripe_width, // length (full stripe)
+ CEPH_OSD_RMW_FLAG_EC_DIRECT_READ, // op_flags with direct read flag
+ &shard_data
+ );
+
+ EXPECT_GE(read_result, 0)
+ << param.label << " direct read to shard " << shard_id << " should complete successfully";
+
+ // For direct reads, we expect to get back only the data for this shard
+ // which is one stripe_unit
+ ASSERT_EQ(shard_data.length(), stripe_unit)
+ << param.label << " shard " << shard_id << " should return " << stripe_unit << " bytes";
+
+ // Verify data integrity: this shard should contain the expected pattern
+ const char* buf = shard_data.c_str();
+ char expected_char = 'A' + (shard_id % 26);
+
+ for (size_t i = 0; i < stripe_unit; i++) {
+ ASSERT_EQ(buf[i], expected_char)
+ << param.label << " shard " << shard_id << " byte " << i
+ << " should be '" << expected_char << "'";
+ }
+ }
+
+ // Clean up
+ auto* primary_listener = get_primary_listener();
+ if (primary_listener) {
+ primary_listener->sent_messages.clear();
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Backend configurations and size parameters
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Field order matches BackendConfig: pool_type, ec_plugin, ec_technique,
+// pool_flags, stripe_unit, k, m, label. Labels feed the GTest name generator.
+const std::vector<BackendConfig> kBackendConfigs = {
+ {PGBackendTestFixture::REPLICATED, "", "", 0, 4096, 4, 2, "Replicated"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 4, 2, "EC_ISA_Opt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 8192, 4, 2, "EC_ISA_Opt_k4m2_su8k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 16384, 4, 2, "EC_ISA_Opt_k4m2_su16k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 2, 1, "EC_ISA_Opt_k2m1_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 8, 3, "EC_ISA_Opt_k8m3_su4k"},
+ {PGBackendTestFixture::EC, "isa", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES, 4096, 4, 2, "EC_ISA_NonOpt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 4, 2, "EC_Jerasure_Opt_k4m2_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 8192, 4, 2, "EC_Jerasure_Opt_k4m2_su8k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 16384, 4, 2, "EC_Jerasure_Opt_k4m2_su16k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 2, 1, "EC_Jerasure_Opt_k2m1_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES | pg_pool_t::FLAG_EC_OPTIMIZATIONS, 4096, 8, 3, "EC_Jerasure_Opt_k8m3_su4k"},
+ {PGBackendTestFixture::EC, "jerasure", "reed_sol_van", pg_pool_t::FLAG_EC_OVERWRITES, 4096, 4, 2, "EC_Jerasure_NonOpt_k4m2_su4k"},
+};
+
+// {payload size in bytes, fill character, label used in test names}.
+const std::vector<WriteReadParam> kSizeParams = {
+ {4 * 1024, 'A', "4k"},
+ {8 * 1024, 'B', "8k"},
+ {12 * 1024, 'C', "12k"},
+ {12 * 1024 + 512, 'D', "12_5k"},
+ {16 * 1024, 'E', "16k"},
+ {31 * 1024 + 512, 'F', "31_5k"},
+ {32 * 1024, 'G', "32k"},
+ {32 * 1024 + 512, 'H', "32_5k"},
+};
+
+/**
+ * Build the cross-product of kBackendConfigs × kSizeParams.
+ */
+std::vector<BackendWriteReadParam> make_cross_product() {
+ std::vector<BackendWriteReadParam> result;
+ result.reserve(kBackendConfigs.size() * kSizeParams.size());
+ for (const auto& backend : kBackendConfigs) {
+ for (const auto& size : kSizeParams) {
+ result.push_back({backend, size});
+ }
+ }
+ return result;
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Instantiate TestBackendBasics with the full cross-product
+// ---------------------------------------------------------------------------
+
+// The name generator concatenates backend and size labels; GTest requires
+// the result to contain only alphanumerics and underscores, which all labels
+// above satisfy.
+INSTANTIATE_TEST_SUITE_P(
+ BackendSizes,
+ TestBackendBasics,
+ ::testing::ValuesIn(make_cross_product()),
+ [](const ::testing::TestParamInfo<BackendWriteReadParam>& info) {
+ return info.param.backend.label + "_" + info.param.write_read.label;
+ }
+);
+
+// ---------------------------------------------------------------------------
+// TestECFailover fixture and tests
+// ---------------------------------------------------------------------------
+
+/**
+ * TestECFailover - tests OSDMap updates and primary failover, parameterized
+ * over all EC backend configurations.
+ *
+ * Failover is an EC-specific concept (shard-based primary election), so only
+ * EC configs are included. The fixture reads k/m/stripe_unit/plugin/technique
+ * from the BackendConfig parameter so that every EC variant is exercised.
+ */
+class TestECFailover : public PGBackendTestFixture,
+ public ::testing::WithParamInterface<BackendConfig> {
+public:
+ TestECFailover() : PGBackendTestFixture(PGBackendTestFixture::EC) {
+ const auto& config = GetParam();
+ k = config.k;
+ m = config.m;
+ stripe_unit = config.stripe_unit;
+ ec_plugin = config.ec_plugin;
+ ec_technique = config.ec_technique;
+ pool_flags = config.pool_flags;
+ }
+
+ // Simulate the failure of one OSD/shard: mark it down in a copy of the
+ // osdmap, install a pg_temp acting set without it, strip it from every
+ // listener's acting sets, and push the new map through update_osdmap().
+ //
+ // The second parameter is intentionally unnamed: callers pass the instance
+ // they expect to become primary for call-site readability, but the actual
+ // primary is derived from the OSDMap by update_osdmap(). Naming it unused
+ // would trigger -Wunused-parameter.
+ void simulate_osd_failure(int failed_osd, int /*new_primary_instance*/)
+ {
+ auto new_osdmap = std::make_shared<OSDMap>();
+ new_osdmap->deepish_copy_from(*osdmap);
+
+ // Build new acting set with the failed OSD replaced by CRUSH_ITEM_NONE
+ std::vector<int> new_acting;
+ for (int i = 0; i < k+m; i++) {
+ new_acting.push_back((i == failed_osd) ? CRUSH_ITEM_NONE : i);
+ }
+
+ // Get the pool to use pgtemp_primaryfirst transformation
+ const pg_pool_t* pool = new_osdmap->get_pg_pool(pgid.pool());
+ ceph_assert(pool != nullptr);
+
+ // For EC pools with optimizations, pgtemp_primaryfirst reorders the acting set
+ // to put primary-eligible shards first. We need to apply this transformation
+ // before setting pg_temp so that the OSDMap will correctly identify the primary.
+ std::vector<int> transformed_acting = new_osdmap->pgtemp_primaryfirst(*pool, new_acting);
+
+ // Use OSDMap::Incremental to set pg_temp with the transformed acting set
+ OSDMap::Incremental inc(new_osdmap->get_epoch() + 1);
+ inc.fsid = new_osdmap->get_fsid();
+ inc.new_state[failed_osd] = CEPH_OSD_EXISTS; // Mark as down (exists but not UP)
+
+ // Convert to mempool vector for pg_temp
+ mempool::osdmap::vector<int> pg_temp_vec(transformed_acting.begin(), transformed_acting.end());
+ inc.new_pg_temp[pgid] = pg_temp_vec;
+
+ new_osdmap->apply_incremental(inc);
+
+ // Finalize the CRUSH map to ensure working_size is calculated
+ new_osdmap->crush->finalize();
+
+ // This fixture maps instance i to both osd.i and shard i (see new_acting
+ // above), so the failed shard id equals the failed osd id.
+ pg_shard_t failed_shard(failed_osd, shard_id_t(failed_osd));
+ for (auto& [instance_id, list] : listeners) {
+ list->shardset.erase(failed_shard);
+ list->acting_recovery_backfill_shard_id_set.erase(shard_id_t(failed_osd));
+ }
+
+ // update_osdmap will query the OSDMap to determine the primary
+ update_osdmap(new_osdmap);
+ }
+};
+
+// Write an object, bump the OSDMap epoch without changing membership, and
+// verify the map propagates to the fixture and listeners while the object
+// remains readable and intact.
+TEST_P(TestECFailover, BasicOSDMapUpdate) {
+ const std::string obj_name = "test_failover_object";
+ const std::string test_data = "Initial data before OSDMap change";
+
+ EXPECT_EQ(create_and_write(obj_name, test_data), 0)
+ << "Initial write should complete successfully";
+
+ bufferlist before_bl;
+ EXPECT_GE(read_object(obj_name, 0, test_data.length(), before_bl, test_data.length()), 0)
+ << "Read should complete successfully";
+ ASSERT_EQ(before_bl.length(), test_data.length());
+
+ // Install a copy of the current map with an incremented epoch.
+ auto bumped_map = std::make_shared<OSDMap>();
+ bumped_map->deepish_copy_from(*osdmap);
+ bumped_map->inc_epoch();
+ update_osdmap(bumped_map);
+
+ EXPECT_EQ(osdmap, bumped_map) << "OSDMap should be updated";
+ auto* primary = get_primary_listener();
+ ASSERT_TRUE(primary != nullptr) << "Primary listener should exist";
+ EXPECT_EQ(primary->osdmap, bumped_map) << "Listener OSDMap should be updated";
+
+ bufferlist after_bl;
+ EXPECT_GE(read_object(obj_name, 0, test_data.length(), after_bl, test_data.length()), 0)
+ << "Read after OSDMap update should complete successfully";
+ ASSERT_EQ(after_bl.length(), test_data.length());
+
+ std::string round_trip(after_bl.c_str(), after_bl.length());
+ EXPECT_EQ(round_trip, test_data) << "Data should match after OSDMap update";
+}
+
+// Write an object, fail OSD 0, and verify that a new primary is elected and
+// that a degraded read reconstructs the data. Fix over the original: the
+// final null-check on the new primary listener is now a fatal ASSERT placed
+// before the dereference (EXPECT is non-fatal, so the old code could
+// dereference nullptr), and the epoch comparison uses an unsigned literal.
+TEST_P(TestECFailover, PrimaryFailover) {
+ const std::string obj_name = "test_primary_failover";
+ const std::string test_data = "Data written before primary failover";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete successfully";
+
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(), read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully";
+ ASSERT_EQ(read_data.length(), test_data.length());
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data) << "Data should match before failover";
+
+ EXPECT_TRUE(listeners[0]->pgb_is_primary())
+ << "Instance 0 should be primary before failover";
+ EXPECT_FALSE(listeners[k]->pgb_is_primary())
+ << "Instance " << k << " should not be primary before failover";
+
+ // Determine expected new primary based on pool optimization
+ // For optimized EC: shards 1 to k-1 are nonprimary, so new primary will be shard k
+ // For non-optimized EC: any shard can be primary, so new primary will be shard 1
+ const pg_pool_t& pool = get_pool();
+ bool is_optimized = pool.has_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ int expected_new_primary = is_optimized ? k : 1;
+
+ simulate_osd_failure(0, expected_new_primary);
+
+ EXPECT_FALSE(listeners[0]->pgb_is_primary())
+ << "Instance 0 should not be primary after failover";
+ EXPECT_TRUE(listeners[expected_new_primary]->pgb_is_primary())
+ << "Instance " << expected_new_primary << " should be primary after failover";
+
+ // Verify the query functions return the correct primary. The listener must
+ // exist before we dereference it below, so this is a fatal assertion.
+ auto* new_primary_listener = get_primary_listener();
+ ASSERT_TRUE(new_primary_listener != nullptr) << "Primary listener should exist after failover";
+ auto* new_primary_backend = get_primary_backend();
+ EXPECT_EQ(new_primary_listener, listeners[expected_new_primary].get())
+ << "get_primary_listener() should return the new primary";
+ EXPECT_EQ(new_primary_backend, backends[expected_new_primary].get())
+ << "get_primary_backend() should return the new primary";
+
+ bufferlist read_data_after;
+ int read_result_after = read_object(obj_name, 0, test_data.length(), read_data_after, test_data.length());
+ EXPECT_GE(read_result_after, 0) << "Degraded read should complete successfully after failover";
+ ASSERT_EQ(read_data_after.length(), test_data.length());
+
+ std::string read_string_after(read_data_after.c_str(), read_data_after.length());
+ EXPECT_EQ(read_string_after, test_data) << "Data should match after failover with EC reconstruction";
+
+ // 1u: epochs are unsigned; avoids a signed/unsigned comparison warning.
+ EXPECT_GT(new_primary_listener->osdmap->get_epoch(), 1u)
+ << "OSDMap epoch should have incremented after failover";
+}
+
+// ---------------------------------------------------------------------------
+// Instantiate TestECFailover with EC-only backend configurations
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Filter kBackendConfigs down to the EC entries; replicated configs are
+// excluded because failover is an EC-only concept.
+std::vector<BackendConfig> make_ec_configs() {
+ std::vector<BackendConfig> result;
+ for (size_t i = 0; i < kBackendConfigs.size(); ++i) {
+ const auto& candidate = kBackendConfigs[i];
+ if (candidate.pool_type != PGBackendTestFixture::EC) {
+ continue;
+ }
+ result.push_back(candidate);
+ }
+ return result;
+}
+
+} // namespace
+
+// Instance names come straight from BackendConfig::label.
+INSTANTIATE_TEST_SUITE_P(
+ ECBackends,
+ TestECFailover,
+ ::testing::ValuesIn(make_ec_configs()),
+ [](const ::testing::TestParamInfo<BackendConfig>& info) {
+ return info.param.label;
+ }
+);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "test/osd/PGBackendTestFixture.h"
+
+/**
+ * WriteReadParam - parameter structure for write-then-read parameterized tests.
+ *
+ * Shared between test files to avoid ODR violations if both translation units
+ * are ever linked together, and to eliminate code duplication.
+ */
+struct WriteReadParam {
+ size_t size; // payload size in bytes
+ char fill; // character used to fill the payload
+ std::string label; // short name used in generated test names
+};
+
+/**
+ * BackendConfig - parameterizes the backend type for unified tests.
+ *
+ * Each configuration defines a pool type (EC or REPLICATED) plus
+ * EC-specific settings. The test fixture uses this to configure
+ * PGBackendTestFixture before SetUp(). Aggregate-initialized in field
+ * order: pool_type, ec_plugin, ec_technique, pool_flags, stripe_unit, k,
+ * m, label.
+ */
+struct BackendConfig {
+ PGBackendTestFixture::PoolType pool_type;
+ // EC-specific (ignored for REPLICATED)
+ std::string ec_plugin; // e.g. "isa", "jerasure", "mock"
+ std::string ec_technique; // e.g. "reed_sol_van"
+ // Pool flags (e.g., FLAG_EC_OVERWRITES | FLAG_EC_OPTIMIZATIONS).
+ // Defaulted to 0 so a default-constructed config never carries
+ // indeterminate flags (the sibling fields below already have defaults).
+ uint64_t pool_flags = 0;
+ uint64_t stripe_unit = 4096; // aka chunk_size; stripe_width = stripe_unit * k
+ int k = 4; // data chunks (EC only)
+ int m = 2; // coding chunks (EC only)
+ // Label for test naming
+ std::string label;
+};
+
+/**
+ * BackendWriteReadParam - combined parameter for backend + write/read size tests.
+ *
+ * Used for two-level parameterization: backend configuration × data sizes.
+ */
+struct BackendWriteReadParam {
+ BackendConfig backend; // which backend/pool layout to build
+ WriteReadParam write_read; // which payload size/fill to exercise
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 sts=2 expandtab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2026 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include "test/osd/ECPeeringTestFixture.h"
+
+using namespace std;
+
+// Peering-enabled EC failover tests, pinned to a single 4+2 ISA
+// reed_sol_van configuration with a 4 KB stripe unit.
+// NOTE(review): pool_flags is left at the fixture default here — confirm
+// whether optimized-EC peering should also be covered.
+class TestECFailoverWithPeering : public ECPeeringTestFixture {
+public:
+ TestECFailoverWithPeering() : ECPeeringTestFixture() {
+ k = 4;
+ m = 2;
+ stripe_unit = 4096;
+ ec_plugin = "isa";
+ ec_technique = "reed_sol_van";
+ }
+};
+
+// Run a full peering cycle and verify all shards activate, the primary goes
+// clean, and exactly shard 0 reports itself as primary.
+TEST_F(TestECFailoverWithPeering, BasicPeeringCycle) {
+ run_peering_cycle();
+
+ EXPECT_TRUE(all_shards_active()) << "All shards should be active after peering";
+
+ // Note: In EC pools, only the primary tracks PG_STATE_CLEAN.
+ // Replicas are in ReplicaActive state and don't set the CLEAN flag.
+ // Get acting_primary from OSDMap
+ pg_t pgid = get_peering_state(0)->get_info().pgid.pgid;
+ std::vector<int> acting_osds;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting_osds, &acting_primary);
+
+ // Guard: acting_primary stays at the -1 sentinel when the map yields no
+ // primary; passing -1 to get_peering_state() below would be invalid.
+ ASSERT_GE(acting_primary, 0) << "OSDMap should map a primary for the PG";
+
+ EXPECT_TRUE(get_peering_state(acting_primary)->is_clean())
+ << "Primary should be clean after peering";
+
+ // Verify primary is shard 0
+ EXPECT_TRUE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Shard 0 should be primary";
+
+ for (int i = 1; i < k + m; i++) {
+ EXPECT_FALSE(get_peering_listener(i)->backend_listener->pgb_is_primary())
+ << "Shard " << i << " should not be primary";
+ }
+}
+
+// After a successful peering cycle, write an object, read it back intact,
+// and verify the write is recorded in the primary's PG log.
+TEST_F(TestECFailoverWithPeering, WriteWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Peering must complete before write";
+
+ const std::string obj_name = "test_write_with_peering";
+ const std::string test_data = "Data written with full peering support";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Write should complete successfully";
+
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(), read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully";
+ ASSERT_EQ(read_data.length(), test_data.length());
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data) << "Data should match";
+
+ // Fatal check before dereferencing the peering state below.
+ auto* primary_ps = get_peering_state(0);
+ ASSERT_TRUE(primary_ps != nullptr) << "Primary peering state should exist";
+ // 0u: log.size() is unsigned; avoids a signed/unsigned comparison warning.
+ EXPECT_GT(primary_ps->get_pg_log().get_log().log.size(), 0u)
+ << "Primary should have log entries after write";
+}
+
+// OSDFailureWithPeering: write a 16KB object, fail a non-primary data shard
+// (shard 1), then re-read an 8KB prefix. Verifies that:
+//   * the primary stays Peering/Active and the failed OSD is dropped from
+//     the acting set,
+//   * the read still returns the correct bytes (EC reconstruction), and
+//   * reconstruction costs extra shard reads, measured by comparing the
+//     number of backend messages sent before vs. after the failure.
+TEST_F(TestECFailoverWithPeering, OSDFailureWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_osd_failure";
+ // Write 16KB but read only 8KB to force reconstruction when shard 1 is down
+ const std::string test_data(16384, 'X'); // 16KB write
+ const size_t read_length = 8192; // 8KB read
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ // Pre-failover read: measure baseline message count with all OSDs up
+ // Clear message counters first
+ for (auto& [shard, listener] : backend_listeners) {
+ listener->sent_messages.clear();
+ }
+
+ bufferlist pre_failover_read;
+ int pre_read_result = read_object(obj_name, 0, read_length,
+ pre_failover_read, test_data.length());
+ EXPECT_GE(pre_read_result, 0) << "Pre-failover read should complete";
+
+ // Count messages sent during pre-failover read
+ size_t pre_failover_msg_count = 0;
+ for (auto& [shard, listener] : backend_listeners) {
+ pre_failover_msg_count += listener->sent_messages.size();
+ }
+
+ int failed_osd = 1; // Fail shard 1 which contains part of the data
+
+ // Use fixture helper to mark OSD as down
+ mark_osd_down(failed_osd);
+
+ // Primary (OSD 0) should remain active after non-primary OSD failure
+ auto* primary_ps = get_peering_state(0);
+ std::string primary_state = get_state_name(0);
+ EXPECT_TRUE(primary_state.find("Peering") != std::string::npos ||
+ primary_state.find("Active") != std::string::npos)
+ << "Primary should be peering or active after OSD failure, got: " << primary_state;
+
+ // The fixture maps osd id N to shard N, hence shard_id_t(failed_osd).
+ EXPECT_TRUE(primary_ps->get_acting_recovery_backfill().count(pg_shard_t(failed_osd, shard_id_t(failed_osd))) == 0)
+ << "Failed OSD should not be in acting set";
+
+ // Clear message counters before post-failover read
+ for (auto& [shard, listener] : backend_listeners) {
+ listener->sent_messages.clear();
+ }
+
+ // Post-failover read: verify EC reconstruction works with one OSD down
+ bufferlist post_failover_read;
+ int post_read_result = read_object(obj_name, 0, read_length,
+ post_failover_read, test_data.length());
+ EXPECT_GE(post_read_result, 0) << "Read should complete successfully after OSD failure";
+ ASSERT_EQ(post_failover_read.length(), read_length)
+ << "Read length should match after OSD failure";
+
+ std::string read_string(post_failover_read.c_str(), post_failover_read.length());
+ std::string expected_data(read_length, 'X');
+ EXPECT_EQ(read_string, expected_data)
+ << "Data should be correctly reconstructed via EC after OSD failure";
+
+ // Count messages sent during post-failover read
+ size_t post_failover_msg_count = 0;
+ for (auto& [shard, listener] : backend_listeners) {
+ post_failover_msg_count += listener->sent_messages.size();
+ }
+
+ // An 8k read of a 16k object in a 4+2 array normally needs 2 shard reads;
+ // with shard 1 missing, reconstruction requires 4 reads, so the post-failover
+ // read must send strictly more backend messages than the baseline.
+ EXPECT_GT(post_failover_msg_count, pre_failover_msg_count)
+ << "Post-failover read should require extra shard reads for EC reconstruction "
+ << "(pre: " << pre_failover_msg_count << ", post: " << post_failover_msg_count << ")";
+}
+
+// PrimaryFailoverWithPeering: fail the initial primary (OSD 0) and verify
+// that PeeringState elects a new primary, the PG returns to Active, and the
+// object written before the failover is still readable (EC reconstruction).
+// Also checks the identity of the new primary: a coding shard (>= k) for an
+// EC-optimized pool, shard 1 otherwise.
+TEST_F(TestECFailoverWithPeering, PrimaryFailoverWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_primary_failover";
+ const std::string test_data = "Data before primary failover";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ EXPECT_TRUE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Shard 0 should be primary initially";
+
+ // Mark OSD 0 (the initial primary) as down
+ // PeeringState will automatically determine the new primary
+ mark_osd_down(0);
+
+ // Determine the actual new primary from the OSDMap
+ int new_primary_shard = get_primary_shard_from_osdmap();
+ ASSERT_GE(new_primary_shard, 0) << "Should have a valid new primary after failover";
+
+ // For an optimized EC pool (k=4, m=2), the new primary should be a coding shard (>= k)
+ // For a non-optimized pool, it would be shard 1
+ const pg_pool_t& pool = get_pool();
+ if (pool.allows_ecoptimizations()) {
+ EXPECT_GE(new_primary_shard, k)
+ << "New primary should be a coding shard (>= k) for optimized pool";
+ } else {
+ EXPECT_EQ(new_primary_shard, 1)
+ << "New primary should be shard 1 for non-optimized pool";
+ }
+
+ // The new primary's backend listener must now report primary status, and
+ // the failed shard's must not.
+ EXPECT_TRUE(get_peering_listener(new_primary_shard)->backend_listener->pgb_is_primary())
+ << "Shard " << new_primary_shard << " should be new primary";
+
+ EXPECT_FALSE(get_peering_listener(0)->backend_listener->pgb_is_primary())
+ << "Failed shard should not be primary";
+
+ std::string state = get_state_name(new_primary_shard)
+ EXPECT_TRUE(state.find("Active") != std::string::npos)
+ << "New primary should be Active after failover, got: " << state;
+
+ // Verify the PG reached Active state
+ EXPECT_TRUE(get_peering_state(new_primary_shard)->is_active())
+ << "New primary should be in Active state";
+
+ // Verify reads work after primary failover (with EC reconstruction)
+ bufferlist read_data;
+ int read_result = read_object(obj_name, 0, test_data.length(),
+ read_data, test_data.length());
+ EXPECT_GE(read_result, 0) << "Read should complete successfully after primary failover";
+ ASSERT_EQ(read_data.length(), test_data.length())
+ << "Read length should match after primary failover";
+
+ std::string read_string(read_data.c_str(), read_data.length());
+ EXPECT_EQ(read_string, test_data)
+ << "Data should be correctly reconstructed via EC after primary failover";
+}
+
+// MultipleOSDFailuresWithPeering: fail exactly m OSDs at once (the pool's
+// full redundancy budget, leaving only k shards up) and verify the primary
+// drops both failed OSDs from its acting set while remaining operational
+// (Active, Peering, or Recovery). This test only inspects peering state;
+// it does not attempt reads with m shards down.
+TEST_F(TestECFailoverWithPeering, MultipleOSDFailuresWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj_name = "test_multiple_failures";
+ const std::string test_data = "Data before multiple failures";
+
+ int result = create_and_write(obj_name, test_data);
+ EXPECT_EQ(result, 0) << "Initial write should complete";
+
+ std::vector<int> failed_osds = {1, 2}; // Fail 2 data shards
+ ASSERT_EQ(failed_osds.size(), static_cast<size_t>(m))
+ << "Should fail exactly m OSDs";
+
+ // Use fixture helper to mark multiple OSDs as down
+ mark_osds_down(failed_osds);
+
+ // The fixture maps osd id N to shard N, hence shard_id_t(failed_osd).
+ auto* primary_ps = get_peering_state(0);
+ for (int failed_osd : failed_osds) {
+ EXPECT_TRUE(primary_ps->get_acting_recovery_backfill().count(
+ pg_shard_t(failed_osd, shard_id_t(failed_osd))) == 0)
+ << "Failed OSD " << failed_osd << " should not be in acting set";
+ }
+
+ // With no spare redundancy the PG may still be transitioning; accept any
+ // of the three operational states rather than requiring Active.
+ std::string primary_state = get_state_name(0);
+ EXPECT_TRUE(primary_state.find("Peering") != std::string::npos ||
+ primary_state.find("Active") != std::string::npos ||
+ primary_state.find("Recovery") != std::string::npos)
+ << "Primary should be operational, got: " << primary_state;
+}
+
+// PeeringWithLogDivergence: write two objects at explicit versions (1,1) and
+// (1,2) to create distinct PG log entries, advance the OSDMap epoch to force
+// a fresh peering cycle, then verify:
+//   * both objects remain readable and byte-correct,
+//   * the primary's PG log head still reflects the latest write, and
+//   * every acting shard's info.last_update is consistent with (<=) the
+//     primary's log head.
+// NOTE(review): no shard actually diverges here — the "divergence" is
+// simulated purely by the explicit eversion numbering plus a re-peer via
+// advance_epoch(). Confirm against the fixture whether this exercises real
+// log-reconciliation paths.
+TEST_F(TestECFailoverWithPeering, PeeringWithLogDivergence) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string pre_div_obj = "test_pre_divergence";
+ const std::string pre_div_data = "Data written before divergence";
+
+ // Explicit eversion (epoch 1, version 1) marks the pre-divergence point.
+ int result = create_and_write(pre_div_obj, pre_div_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << "Pre-divergence write should complete";
+
+ auto* primary_ps = get_peering_state(0);
+ size_t initial_log_size = primary_ps->get_pg_log().get_log().log.size();
+ EXPECT_GT(initial_log_size, 0) << "Primary should have log entries after pre-divergence write";
+
+ // Note: get_pg_log().get_log().head reflects the log entries added via append_log
+ eversion_t pre_div_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(pre_div_log_head.version, 0u) << "PG log head should be non-zero after write";
+
+ const std::string post_div_obj = "test_post_divergence";
+ const std::string post_div_data = "Data written after divergence point";
+
+ // Second write at (1, 2) — one version past the divergence point.
+ result = create_and_write(post_div_obj, post_div_data, eversion_t(1, 2));
+ EXPECT_EQ(result, 0) << "Post-divergence write should complete";
+
+ eversion_t post_div_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(post_div_log_head.version, pre_div_log_head.version)
+ << "PG log head should advance after post-divergence write";
+
+ size_t log_size_after_writes = primary_ps->get_pg_log().get_log().log.size();
+ EXPECT_GE(log_size_after_writes, initial_log_size)
+ << "Primary log should have at least as many entries after second write";
+
+ // Trigger a new peering cycle by advancing the map to simulate re-peering
+ // after a shard had a divergent log.
+ advance_epoch();
+
+ std::string primary_state = get_state_name(0);
+ ASSERT_TRUE(all_shards_active() ||
+ primary_state.find("Recovery") != std::string::npos ||
+ primary_state.find("Peering") != std::string::npos)
+ << "Shards should be active, recovering, or peering after map advance, got: "
+ << primary_state;
+
+ // --- Verify pre-divergence data is readable and correct ---
+ bufferlist pre_div_read;
+ int read_result = read_object(pre_div_obj, 0, pre_div_data.length(),
+ pre_div_read, pre_div_data.length());
+ EXPECT_GE(read_result, 0) << "Pre-divergence object should be readable after reconciliation";
+ ASSERT_EQ(pre_div_read.length(), pre_div_data.length())
+ << "Pre-divergence read length should match";
+ {
+ std::string read_str(pre_div_read.c_str(), pre_div_read.length());
+ EXPECT_EQ(read_str, pre_div_data)
+ << "Pre-divergence data should match after log reconciliation";
+ }
+
+ // --- Verify post-divergence data is readable and correct ---
+ bufferlist post_div_read;
+ read_result = read_object(post_div_obj, 0, post_div_data.length(),
+ post_div_read, post_div_data.length());
+ EXPECT_GE(read_result, 0) << "Post-divergence object should be readable after reconciliation";
+ ASSERT_EQ(post_div_read.length(), post_div_data.length())
+ << "Post-divergence read length should match";
+ {
+ std::string read_str(post_div_read.c_str(), post_div_read.length());
+ EXPECT_EQ(read_str, post_div_data)
+ << "Post-divergence data should match after log reconciliation";
+ }
+
+ // After peering, the primary's PG log head should reflect all writes.
+ eversion_t primary_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_EQ(primary_log_head, post_div_log_head)
+ << "Primary PG log head should reflect all writes after reconciliation";
+
+ // Cross-check every acting shard's last_update against the primary's head.
+ pg_t pgid = get_peering_state(0)->get_info().pgid.pgid;
+ std::vector<int> acting_osds;
+ int acting_primary = -1;
+ osdmap->pg_to_acting_osds(pgid, &acting_osds, &acting_primary);
+
+ for (int shard : acting_osds) {
+ if (shard == CRUSH_ITEM_NONE) {
+ continue;
+ }
+ auto* shard_ps = get_peering_state(shard);
+ if (shard_ps->is_active()) {
+ eversion_t shard_info_last_update = shard_ps->get_info().last_update;
+ if (shard == acting_primary) {
+ EXPECT_EQ(shard_info_last_update, post_div_log_head)
+ << "Primary shard info.last_update should match post-divergence log head";
+ } else {
+ // Replicas may lag but must never claim updates the primary lacks.
+ EXPECT_LE(shard_info_last_update, post_div_log_head)
+ << "Shard " << shard << " info.last_update should not exceed primary's log head";
+ }
+ }
+ }
+
+ // Verify the formerly-failed shard's PG log is accessible and consistent.
+ // We use the last data shard (k-1) as the "formerly-failed" shard to check.
+ int reconciled_shard = k - 1;
+ if (reconciled_shard >= 0 && reconciled_shard < k + m) {
+ auto* reconciled_ps = get_peering_state(reconciled_shard);
+ size_t reconciled_log_size = reconciled_ps->get_pg_log().get_log().log.size();
+ auto* primary_ps_check = get_peering_state(acting_primary);
+ size_t primary_log_size = primary_ps_check->get_pg_log().get_log().log.size();
+ EXPECT_LE(reconciled_log_size, primary_log_size)
+ << "Reconciled shard " << reconciled_shard
+ << " log size should not exceed primary's log size";
+
+ if (reconciled_ps->is_active()) {
+ eversion_t reconciled_info_lu = reconciled_ps->get_info().last_update;
+ EXPECT_LE(reconciled_info_lu, post_div_log_head)
+ << "Reconciled shard " << reconciled_shard
+ << " info.last_update should not exceed primary's log head after log reconciliation";
+ }
+ }
+}
+
+// RecoveryWithPeering: write two objects, fail the last data shard (k-1),
+// then verify:
+//   * both pre-failure objects remain readable and byte-correct via EC
+//     reconstruction from the surviving k shards,
+//   * a new write still succeeds with the shard down and advances the
+//     primary's PG log head,
+//   * the failed shard's PeeringState (and its pre-failure log) is still
+//     accessible and no larger than the primary's, and
+//   * on_activate_complete was invoked on the primary's listener during
+//     peering.
+TEST_F(TestECFailoverWithPeering, RecoveryWithPeering) {
+ run_peering_cycle();
+ ASSERT_TRUE(all_shards_active()) << "Initial peering must complete";
+
+ const std::string obj1_name = "test_recovery_obj1";
+ const std::string obj1_data = "First object data for recovery test";
+
+ const std::string obj2_name = "test_recovery_obj2";
+ const std::string obj2_data = "Second object data for recovery test";
+
+ // Explicit eversions (1,1) and (1,2) give the two writes distinct log entries.
+ int result = create_and_write(obj1_name, obj1_data, eversion_t(1, 1));
+ EXPECT_EQ(result, 0) << "First pre-failure write should complete";
+
+ result = create_and_write(obj2_name, obj2_data, eversion_t(1, 2));
+ EXPECT_EQ(result, 0) << "Second pre-failure write should complete";
+
+ EXPECT_TRUE(all_shards_clean()) << "All shards should be clean before recovery test";
+
+ auto* primary_ps = get_peering_state(0);
+ eversion_t pre_failure_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(pre_failure_log_head.version, 0u)
+ << "Primary should have log entries before failure";
+
+ int failed_osd = k - 1; // Last data shard
+
+ // Use fixture helper to mark OSD as down
+ mark_osd_down(failed_osd);
+
+ std::string state_after_failure = get_state_name(0);
+ ASSERT_TRUE(all_shards_active() ||
+ state_after_failure.find("Recovery") != std::string::npos ||
+ state_after_failure.find("Peering") != std::string::npos)
+ << "PG should be active, recovering, or peering after OSD failure, got: "
+ << state_after_failure;
+
+ // EC can reconstruct data from remaining k shards even with one shard missing
+ bufferlist obj1_read;
+ int read_result = read_object(obj1_name, 0, obj1_data.length(),
+ obj1_read, obj1_data.length());
+ EXPECT_GE(read_result, 0) << "First object should be readable after OSD failure";
+ ASSERT_EQ(obj1_read.length(), obj1_data.length())
+ << "First object read length should match after failure";
+ {
+ std::string read_str(obj1_read.c_str(), obj1_read.length());
+ EXPECT_EQ(read_str, obj1_data)
+ << "First object data should be correct after OSD failure (EC reconstruction)";
+ }
+
+ bufferlist obj2_read;
+ read_result = read_object(obj2_name, 0, obj2_data.length(),
+ obj2_read, obj2_data.length());
+ EXPECT_GE(read_result, 0) << "Second object should be readable after OSD failure";
+ ASSERT_EQ(obj2_read.length(), obj2_data.length())
+ << "Second object read length should match after failure";
+ {
+ std::string read_str(obj2_read.c_str(), obj2_read.length());
+ EXPECT_EQ(read_str, obj2_data)
+ << "Second object data should be correct after OSD failure (EC reconstruction)";
+ }
+
+ // A third write, issued while the shard is down, must still succeed.
+ const std::string post_recovery_obj = "test_post_recovery";
+ const std::string post_recovery_data = "Data written after OSD failure and recovery";
+
+ result = create_and_write(post_recovery_obj, post_recovery_data, eversion_t(1, 3));
+ EXPECT_EQ(result, 0) << "Write after OSD failure should complete successfully";
+
+ bufferlist post_recovery_read;
+ read_result = read_object(post_recovery_obj, 0, post_recovery_data.length(),
+ post_recovery_read, post_recovery_data.length());
+ EXPECT_GE(read_result, 0) << "Post-recovery object should be readable";
+ ASSERT_EQ(post_recovery_read.length(), post_recovery_data.length())
+ << "Post-recovery read length should match";
+ {
+ std::string read_str(post_recovery_read.c_str(), post_recovery_read.length());
+ EXPECT_EQ(read_str, post_recovery_data)
+ << "Post-recovery data should match what was written";
+ }
+
+ eversion_t post_recovery_log_head = primary_ps->get_pg_log().get_log().head;
+ EXPECT_GT(post_recovery_log_head.version, pre_failure_log_head.version)
+ << "Primary PG log head should advance after post-recovery write";
+
+ // Even though the OSD is "down", its PeeringState still holds the log
+ // from before it went down.
+ auto* failed_ps = get_peering_state(failed_osd);
+ EXPECT_TRUE(failed_ps != nullptr) << "Failed OSD's PeeringState should still exist";
+
+ size_t primary_log_size = primary_ps->get_pg_log().get_log().log.size();
+ size_t failed_log_size = failed_ps->get_pg_log().get_log().log.size();
+ EXPECT_LE(failed_log_size, primary_log_size)
+ << "Failed OSD's PG log size should not exceed primary's log size";
+ // The primary wrote 3 objects (obj1, obj2, post_recovery_obj), so its log must be non-empty.
+ EXPECT_GT(primary_log_size, 0u)
+ << "Primary PG log should have entries after 3 writes";
+
+ auto* listener_ptr = get_peering_listener(0);
+ EXPECT_TRUE(listener_ptr != nullptr) << "Peering listener should exist";
+ EXPECT_TRUE(listener_ptr->activate_complete_called)
+ << "on_activate_complete should have been called during peering";
+}
+
using namespace std;
-
IsPGRecoverablePredicate *get_is_recoverable_predicate() {
return new MockECRecPred();
}
return new MockECReadPred();
}
-
// Test fixture for PeeringState tests
class PeeringStateTest : public ::testing::Test {
protected:
for (auto it = ls.begin(); it != ls.end();) {
MessageRef m = *it;
it = ls.erase(it);
- // TODO : Should handle messages other than MOSDPeeringOp events, however
- // for now this seems to be sufficient
+ // NOTE: This dispatcher only handles MOSDPeeringOp-derived messages (MOSDPGLog,
+ // MOSDPGNotify2, MOSDPGInfo2, MOSDPGLease, MOSDPGLeaseAck, MOSDPGQuery2, MOSDPGTrim).
+ // Non-peering messages like MOSDPGRemove and MRecoveryReserve are sent via
+ // send_cluster_message() but are not dispatched through this function - they are
+ // handled by other test mechanisms or are not relevant to peering state transitions.
+ // This is sufficient for testing PeeringState behavior as all peering-related
+ // messages derive from MOSDPeeringOp and provide get_event() for state machine events.
+ // Future enhancement: If testing non-peering cluster messages becomes necessary,
+ // add type checking and appropriate handling for Message-derived (non-MOSDPeeringOp) types.
dout(0) << __func__ << " message type = " << m->get_type() << dendl;
MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m.get());
dout(0) << __func__ << " sending from osd." << fromosd << " to osd." << osd << " " << *pm << dendl;
PGPool pool(osdmap, pool_id, pi, osdmap->get_pool_name(pool_id));
dpp[osd] = make_unique<DppHelper>(g_ceph_context, dout_subsys, this, osd, shard);
spg_t spgid = spg_t(pg_t(0, pool_id), pg_whoami.shard);
- listeners[osd] = make_unique<MockPeeringListener>(osdmap, pi, get_dpp(osd), pg_whoami);
+ listeners[osd] = make_unique<MockPeeringListener>(osdmap, pool_id, get_dpp(osd), pg_whoami);
get_listener(osd)->current_epoch = osdmap->get_epoch();
unique_ptr<PeeringState> ps = make_unique<PeeringState>(
g_ceph_context,