Optimized EC pools support partial writes that do not update every shard.
Consequently, shards that are not updated can have out-of-date version
numbers. The primary shard's object_info_t is always updated and tracks the
expected version of each shard. To avoid unnecessary backfill work, the
version comparisons that decide whether a shard needs backfill must use
this extra data in the object_info_t to work out whether a shard is
missing updates or simply did not participate in recent partial writes.

See the comments in src/osd/recovery_types.h.
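
For illustration only, a minimal sketch of the lookup this enables,
assuming the shard_versions map carried by object_info_t in this series
(the helper name is hypothetical):

    // Expected version of a backfill target shard under partial writes.
    // A shard with no entry in shard_versions participated in every
    // write and is expected to be at the object's overall version.
    eversion_t expected_shard_version(const object_info_t &oi,
                                      const shard_id_t shard)
    {
      if (auto it = oi.shard_versions.find(shard);
          it != oi.shard_versions.end()) {
        return it->second; // shard skipped recent partial writes
      }
      return oi.version;   // shard saw every update
    }

A backfill target whose object is at this expected version is clean;
only a mismatch means the shard is genuinely missing updates.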
Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
- ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
${PROJECT_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
watch.cc
const std::vector<pg_shard_t> &peers) override {
return peering_state.prepare_backfill_for_missing(soid, v, peers);
}
+
+ const pg_pool_t& get_pool() const override {
+ return peering_state.get_pgpool().info;
+ }
+
PeeringFacade(PeeringState& peering_state)
: peering_state(peering_state) {
}
ceph_assert(backfill_state().last_backfill_started == \
peering_state().earliest_backfill());
ceph_assert(peering_state().is_backfilling());
- // initialize BackfillIntervals
+ // initialize ReplicaBackfillIntervals
for (const auto& bt : peering_state().get_backfill_targets()) {
backfill_state().peer_backfill_info[bt].reset(
peering_state().get_peer_last_backfill(bt));
if (e.is_update()) {
DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
pg(), e.soid, e.version);
- primary_bi.objects.erase(e.soid);
- primary_bi.objects.insert(std::make_pair(e.soid,
- e.version));
+ if (e.written_shards.empty()) {
+ // Log entry updates all shards, replace all entries for e.soid
+ primary_bi.objects.erase(e.soid);
+ primary_bi.objects.insert(
+ std::make_pair(e.soid,
+ std::make_pair(shard_id_t::NO_SHARD,
+ e.version)));
+ } else {
+ // Update backfill interval for shards modified by log entry
+ std::map<shard_id_t,eversion_t> versions;
+        // Create map from existing entries in the backfill interval
+ const auto & [begin, end] = primary_bi.objects.equal_range(e.soid);
+ for (const auto & entry : std::ranges::subrange(begin, end)) {
+ const auto & [shard, version] = entry.second;
+ versions[shard] = version;
+ }
+ // Update entries in map that are modified by log entry
+ bool uses_default = false;
+ for (const auto & shard : peering_state().get_backfill_targets()) {
+ if (e.is_written_shard(shard.shard)) {
+ versions.erase(shard.shard);
+ uses_default = true;
+ } else {
+ if (!versions.contains(shard.shard)) {
+ versions[shard.shard] = e.prior_version;
+ }
+            // Else: keep existing version
+ }
+ }
+ if (uses_default) {
+ versions[shard_id_t::NO_SHARD] = e.version;
+ } else {
+ versions.erase(shard_id_t::NO_SHARD);
+ }
+ // Erase and recreate backfill interval for e.soid using map
+ primary_bi.objects.erase(e.soid);
+ for (auto & [shard, version] : versions) {
+ primary_bi.objects.insert(
+ std::make_pair(e.soid,
+ std::make_pair(shard, version)));
+ }
+ }
} else if (e.is_delete()) {
DEBUGDPP("maybe_update_range(lambda): {} removed",
pg(), e.soid);
- primary_bi.objects.erase(e.soid);
+ primary_bi.objects.erase(e.soid); // Erase all entries for e.soid
}
}
};
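
A worked example (invented values) of the rebuild above; the same rule
appears again in PrimaryLogPG::update_range() for the classic OSD:

    // Backfill targets on shards 1 and 3; existing interval entry
    // { NO_SHARD -> 1'23 }; log entry e with e.version = 1'25,
    // e.prior_version = 1'23 and e.written_shards = {1}.
    //
    //   shard 1: written             -> erase its entry, uses_default = true
    //   shard 3: not written, absent -> versions[3] = e.prior_version (1'23)
    //   uses_default                 -> versions[NO_SHARD] = e.version (1'25)
    //
    // Result: { NO_SHARD -> 1'25, 3 -> 1'23 }; shard 3 is expected to have
    // skipped the partial write, so it is not treated as missing updates.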
/* static */ bool BackfillState::Enqueuing::all_enqueued(
const PeeringFacade& peering_state,
- const BackfillInterval& backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+ const PrimaryBackfillInterval& backfill_info,
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
{
const bool all_local_enqueued = \
backfill_info.extends_to_end() && backfill_info.empty();
}
hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const
{
hobject_t e = hobject_t::get_max();
for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
}
bool BackfillState::Enqueuing::should_rescan_replicas(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const
{
const auto& targets = peering_state().get_backfill_targets();
return std::any_of(std::begin(targets), std::end(targets),
}
bool BackfillState::Enqueuing::should_rescan_primary(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const
{
return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
!backfill_info.extends_to_end() && backfill_info.empty();
void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
BackfillState::Enqueuing::result_t&& result,
hobject_t& last_backfill_started,
- std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+ std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
{
std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
[&peer_backfill_info] (const auto& bt) {
result_t result { {}, primary_bi.begin };
std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
+ std::map<shard_id_t,eversion_t> versions;
+ auto it = primary_bi.objects.begin();
+ const hobject_t& hoid = it->first;
+ eversion_t obj_v;
+ while (it != primary_bi.objects.end() && it->first == hoid) {
+ obj_v = std::max(obj_v, it->second.second);
+ versions[it->second.first] = it->second.second;
+ ++it;
+ }
+
for (const auto& bt : peering_state().get_backfill_targets()) {
const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
// Find all check peers that have the wrong version
- if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
- check == primary_bi.begin && check == peer_bi.begin) {
- if (peer_bi.objects.begin()->second != obj_v) {
+ if (check == primary_bi.begin && check == peer_bi.begin) {
+ eversion_t replicaobj_v;
+ if (versions.contains(bt.shard)) {
+ replicaobj_v = versions.at(bt.shard);
+ } else {
+ replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+ }
+ if (peer_bi.objects.begin()->second != replicaobj_v) {
std::ignore = backfill_state().progress_tracker->enqueue_push(
primary_bi.begin);
auto &[v, peers] = backfills[primary_bi.begin];
}
bool BackfillState::Enqueuing::Enqueuing::all_emptied(
- const BackfillInterval& local_backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+ const PrimaryBackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const
{
const auto& targets = peering_state().get_backfill_targets();
const auto replicas_emptied =
// -- ReplicasScanning
bool BackfillState::ReplicasScanning::replica_needs_scan(
- const BackfillInterval& replica_backfill_info,
- const BackfillInterval& local_backfill_info)
+ const ReplicaBackfillInterval& replica_backfill_info,
+ const PrimaryBackfillInterval& local_backfill_info)
{
return replica_backfill_info.empty() && \
replica_backfill_info.begin <= local_backfill_info.begin && \
#include "osd/recovery_types.h"
#include "osd/PGLog.h"
+#include "osd/PeeringState.h"
namespace crimson::osd {
// events comes first
struct PrimaryScanned : sc::event<PrimaryScanned> {
- BackfillInterval result;
- PrimaryScanned(BackfillInterval&& result)
+ PrimaryBackfillInterval result;
+ PrimaryScanned(PrimaryBackfillInterval&& result)
: result(std::move(result)) {
}
};
struct ReplicaScanned : sc::event<ReplicaScanned> {
pg_shard_t from;
- BackfillInterval result;
- ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+ ReplicaBackfillInterval result;
+ ReplicaScanned(pg_shard_t from, ReplicaBackfillInterval&& result)
: from(std::move(from)),
result(std::move(result)) {
}
// completed yet.
static bool all_enqueued(
const PeeringFacade& peering_state,
- const BackfillInterval& backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ const PrimaryBackfillInterval& backfill_info,
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
private:
void maybe_update_range();
// these methods take BackfillIntervals instead of extracting them from
// the state to emphasize the relationships across the main loop.
bool all_emptied(
- const BackfillInterval& local_backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ const PrimaryBackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const;
hobject_t earliest_peer_backfill(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const;
bool should_rescan_replicas(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const;
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const;
// indicate whether a particular acting primary needs to be scanned again
// to process the next piece of the hobject_t's namespace.
// the logic is per analogy to replica_needs_scan(). See comments there.
bool should_rescan_primary(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const;
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const;
// the result_t is intermediary between {remove,update}_on_peers() and
- // updating BackfillIntervals in trim_backfilled_object_from_intervals.
- // This step is important because it affects the main loop's condition,
- // and thus deserves to be exposed instead of being called deeply from
- // {remove,update}_on_peers().
+ // updating ReplicaBackfillIntervals in
+ // trim_backfilled_object_from_intervals. This step is important
+ // because it affects the main loop's condition, and thus deserves to be
+ // exposed instead of being called deeply from {remove,update}_on_peers().
struct [[nodiscard]] result_t {
std::set<pg_shard_t> pbi_targets;
hobject_t new_last_backfill_started;
void trim_backfilled_object_from_intervals(
result_t&&,
hobject_t& last_backfill_started,
- std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
result_t remove_on_peers(const hobject_t& check);
result_t update_on_peers(const hobject_t& check);
};
sc::result react(Triggered);
// indicate whether a particular peer should be scanned to retrieve
- // BackfillInterval for new range of hobject_t namespace.
+ // ReplicaBackfillInterval for new range of hobject_t namespace.
// true when bi.objects is exhausted, replica bi's end is not MAX,
// and primary bi's begin is further than the replica's one.
static bool replica_needs_scan(
- const BackfillInterval& replica_backfill_info,
- const BackfillInterval& local_backfill_info);
+ const ReplicaBackfillInterval& replica_backfill_info,
+ const PrimaryBackfillInterval& local_backfill_info);
private:
std::set<pg_shard_t> waiting_on_backfill;
backfill_suspend_state.should_go_enqueuing = true;
}
hobject_t last_backfill_started;
- BackfillInterval backfill_info;
- std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ PrimaryBackfillInterval backfill_info;
+ std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
BackfillMachine backfill_machine;
std::unique_ptr<ProgressTracker> progress_tracker;
size_t replicas_in_backfill = 0;
const hobject_t &soid,
const eversion_t &v,
const std::vector<pg_shard_t> &peers) = 0;
+ virtual const pg_pool_t& get_pool() const = 0;
virtual ~PeeringFacade() {}
};
{
logger().debug("{}", __func__);
using crimson::common::local_conf;
- std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary(
begin,
local_conf()->osd_backfill_scan_min,
- local_conf()->osd_backfill_scan_max
- ).then_interruptible([this] (BackfillInterval bi) {
+ local_conf()->osd_backfill_scan_max,
+ pg->get_peering_state().get_backfill_targets()
+ ).then_interruptible([this] (PrimaryBackfillInterval bi) {
logger().debug("request_primary_scan:{}", __func__);
using BackfillState = crimson::osd::BackfillState;
backfill_state->process_event(
pg.get_collection_ref(), std::move(t)).or_terminate());
}
-RecoveryBackend::interruptible_future<BackfillInterval>
-RecoveryBackend::scan_for_backfill(
+RecoveryBackend::interruptible_future<PrimaryBackfillInterval>
+RecoveryBackend::scan_for_backfill_primary(
+ const hobject_t start,
+ [[maybe_unused]] const std::int64_t min,
+ const std::int64_t max,
+ const std::set<pg_shard_t> &backfill_targets)
+{
+ LOG_PREFIX(RecoveryBackend::scan_for_backfill_primary);
+ DEBUGDPP("starting from {}", pg, start);
+ auto version_map = seastar::make_lw_shared<std::multimap<hobject_t,
+ std::pair<shard_id_t,eversion_t>>>();
+ auto&& [objects, next] = co_await backend->list_objects(start, max);
+ co_await interruptor::parallel_for_each(objects,
+ seastar::coroutine::lambda([FNAME, this, version_map, backfill_targets]
+ (const hobject_t& object) -> interruptible_future<> {
+ DEBUGDPP("querying obj:{}", pg, object);
+ auto obc_manager = pg.obc_loader.get_obc_manager(object);
+ co_await pg.obc_loader.load_and_lock(
+ obc_manager, RWState::RWREAD
+ ).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+
+ if (obc_manager.get_obc()->obs.exists) {
+ auto version = obc_manager.get_obc()->obs.oi.version;
+ auto shard_versions = obc_manager.get_obc()->obs.oi.shard_versions;
+ if (shard_versions.empty()) {
+ version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+ version));
+ } else {
+ bool added_default = false;
+ for (auto & shard: backfill_targets) {
+          if (shard_versions.contains(shard.shard)) {
+            // This shard missed recent partial writes; record its version
+            version_map->emplace(
+              object,
+              std::make_pair(shard.shard, shard_versions.at(shard.shard)));
+ } else if (!added_default) {
+ version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+ version));
+ added_default = true;
+ }
+ }
+ }
+ DEBUGDPP("found: {} {}", pg,
+ object, version);
+ co_return;
+ } else {
+ // if the object does not exist here, it must have been removed
+ // between the collection_list_partial and here. This can happen
+ // for the first item in the range, which is usually last_backfill.
+ co_return;
+ }
+ }));
+ PrimaryBackfillInterval bi;
+ bi.begin = std::move(start);
+ bi.end = std::move(next);
+ bi.objects = std::move(*version_map);
+ DEBUGDPP("{} PrimaryBackfillInterval filled, leaving, {}",
+ "scan_for_backfill_primary",
+ pg, bi);
+ co_return std::move(bi);
+}
+
+RecoveryBackend::interruptible_future<ReplicaBackfillInterval>
+RecoveryBackend::scan_for_backfill_replica(
const hobject_t start,
[[maybe_unused]] const std::int64_t min,
const std::int64_t max)
{
- LOG_PREFIX(RecoveryBackend::scan_for_backfill);
+ LOG_PREFIX(RecoveryBackend::scan_for_backfill_replica);
DEBUGDPP("starting from {}", pg, start);
- auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+ auto version_map = seastar::make_lw_shared<std::map<hobject_t,
+ eversion_t>>();
auto&& [objects, next] = co_await backend->list_objects(start, max);
- co_await interruptor::parallel_for_each(objects, seastar::coroutine::lambda([FNAME, this, version_map]
+ co_await interruptor::parallel_for_each(objects,
+ seastar::coroutine::lambda([FNAME, this, version_map]
(const hobject_t& object) -> interruptible_future<> {
DEBUGDPP("querying obj:{}", pg, object);
auto obc_manager = pg.obc_loader.get_obc_manager(object);
co_return;
}
}));
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = std::move(start);
bi.end = std::move(next);
bi.objects = std::move(*version_map);
- DEBUGDPP("{} BackfillInterval filled, leaving, {}",
- "scan_for_backfill",
+ DEBUGDPP("{} ReplicaBackfillInterval filled, leaving, {}",
+ "scan_for_backfill_replica",
pg, bi);
co_return std::move(bi);
}
PeeringState::BackfillTooFull());
return seastar::now();
}
- return scan_for_backfill(
+ return scan_for_backfill_replica(
std::move(m.begin),
crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
// Check that from is one of the backfill_targets
ceph_assert(pg.is_backfill_target(m.from));
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = m.begin;
bi.end = m.end;
{
const hobject_t& soid,
eversion_t need) = 0;
- interruptible_future<BackfillInterval> scan_for_backfill(
+ interruptible_future<PrimaryBackfillInterval> scan_for_backfill_primary(
+ const hobject_t from,
+ std::int64_t min,
+ std::int64_t max,
+ const std::set<pg_shard_t> &backfill_targets);
+
+ interruptible_future<ReplicaBackfillInterval> scan_for_backfill_replica(
const hobject_t from,
std::int64_t min,
std::int64_t max);
scheduler/mClockScheduler.cc
PeeringState.cc
PGStateUtils.cc
- recovery_types.cc
MissingLoc.cc
osd_perf_counters.cc
ECCommonL.cc
std::set<int> probe_targets;
protected:
- BackfillInterval backfill_info;
- std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ PrimaryBackfillInterval backfill_info;
+ std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
bool backfill_reserving;
// The primary's num_bytes and local num_bytes for this pg, only valid
return;
}
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = m->begin;
// No need to flush; there won't be any in-progress writes occurring
// past m->begin
- scan_range(
+ scan_range_replica(
cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max,
&bi,
// Check that from is one of the backfill_targets
ceph_assert(is_backfill_target(from));
- BackfillInterval& bi = peer_backfill_info[from];
+ ReplicaBackfillInterval& bi = peer_backfill_info[from];
bi.begin = m->begin;
bi.end = m->end;
auto p = m->get_data().cbegin();
for (const pg_shard_t& bt : get_backfill_targets()) {
const auto piter = peer_backfill_info.find(bt);
ceph_assert(piter != peer_backfill_info.end());
- const BackfillInterval& pbi = piter->second;
+ const ReplicaBackfillInterval& pbi = piter->second;
// See if peer has more to process
if (!pbi.extends_to_end() || !pbi.empty())
return false;
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
if (pbi.begin <= backfill_info.begin &&
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
if (pbi.begin == check)
check_targets.insert(bt);
}
i != check_targets.end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
ceph_assert(pbi.begin == check);
to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
// and we can't increment ops without requeueing ourself
// for recovery.
} else {
- eversion_t& obj_v = backfill_info.objects.begin()->second;
-
+ // Unpack versions for the object being backfilled
+ auto it = backfill_info.objects.begin();
+ const hobject_t& hoid = it->first;
+ eversion_t obj_v;
+ std::map<shard_id_t,eversion_t> versions;
+ while (it != backfill_info.objects.end() && it->first == hoid) {
+ obj_v = std::max(obj_v, it->second.second);
+ versions[it->second.first] = it->second.second;
+ ++it;
+ }
vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
// Find all check peers that have the wrong version
if (check == backfill_info.begin && check == pbi.begin) {
- if (pbi.objects.begin()->second != obj_v) {
+ eversion_t replicaobj_v;
+ if (versions.contains(bt.shard)) {
+ replicaobj_v = versions.at(bt.shard);
+ } else {
+ replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+ }
+ if (pbi.objects.begin()->second != replicaobj_v) {
need_ver_targs.push_back(bt);
} else {
keep_ver_targs.push_back(bt);
i != check_targets.end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
pbi.pop_front();
}
}
}
void PrimaryLogPG::update_range(
- BackfillInterval *bi,
+ PrimaryBackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
int local_min = cct->_conf->osd_backfill_scan_min;
int local_max = cct->_conf->osd_backfill_scan_max;
+ const std::set<pg_shard_t>& backfill_targets = get_backfill_targets();
if (bi->version < info.log_tail) {
dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
<< dendl;
bi->version = info.last_update;
- scan_range(local_min, local_max, bi, handle);
+ scan_range_primary(local_min, local_max, bi, handle, backfill_targets);
}
if (bi->version >= projected_last_update) {
if (e.is_update()) {
dout(10) << __func__ << ": " << e.soid << " updated to version "
<< e.version << dendl;
- bi->objects.erase(e.soid);
- bi->objects.insert(
- make_pair(
- e.soid,
- e.version));
+ if (e.written_shards.empty()) {
+ // Log entry updates all shards, replace all entries for e.soid
+ bi->objects.erase(e.soid);
+ bi->objects.insert(make_pair(e.soid,
+ make_pair(shard_id_t::NO_SHARD,
+ e.version)));
+ } else {
+ // Update backfill interval for shards modified by log entry
+ std::map<shard_id_t,eversion_t> versions;
+        // Create map from existing entries in the backfill interval
+ const auto & [begin, end] = bi->objects.equal_range(e.soid);
+ for (const auto & entry : std::ranges::subrange(begin, end)) {
+ const auto & [shard, version] = entry.second;
+ versions[shard] = version;
+ }
+ // Update entries in map that are modified by log entry
+ bool uses_default = false;
+ for (const auto & shard : backfill_targets) {
+ if (e.is_written_shard(shard.shard)) {
+ versions.erase(shard.shard);
+ uses_default = true;
+ } else {
+ if (!versions.contains(shard.shard)) {
+ versions[shard.shard] = e.prior_version;
+ }
+            // Else: keep existing version
+ }
+ }
+ if (uses_default) {
+ versions[shard_id_t::NO_SHARD] = e.version;
+ } else {
+ versions.erase(shard_id_t::NO_SHARD);
+ }
+ // Erase and recreate backfill interval for e.soid using map
+ bi->objects.erase(e.soid);
+ for (auto & [shard, version] : versions) {
+ bi->objects.insert(make_pair(e.soid, make_pair(shard, version)));
+ }
+ }
} else if (e.is_delete()) {
dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
- bi->objects.erase(e.soid);
+ bi->objects.erase(e.soid); // Erase all entries for e.soid
}
}
};
projected_log.scan_log_after(bi->version, func);
bi->version = projected_last_update;
} else {
- ceph_abort_msg("scan_range should have raised bi->version past log_tail");
+ ceph_abort_msg("scan_range_primary should have raised bi->version past log_tail");
}
}
-void PrimaryLogPG::scan_range(
- int min, int max, BackfillInterval *bi,
- ThreadPool::TPHandle &handle)
+void PrimaryLogPG::scan_range_primary(
+ int min, int max, PrimaryBackfillInterval *bi,
+ ThreadPool::TPHandle &handle,
+ const std::set<pg_shard_t> &backfill_targets)
{
ceph_assert(is_locked());
- dout(10) << "scan_range from " << bi->begin << dendl;
+ dout(10) << "scan_range_primary from " << bi->begin <<
+ " backfill_targets " << backfill_targets << dendl;
bi->clear_objects();
vector<hobject_t> ls;
for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
handle.reset_tp_timeout();
- ObjectContextRef obc;
- if (is_primary())
- obc = object_contexts.lookup(*p);
+
+ ceph_assert(is_primary());
+
+ eversion_t version;
+ std::map<shard_id_t,eversion_t> shard_versions;
+ ObjectContextRef obc = object_contexts.lookup(*p);
+
if (obc) {
if (!obc->obs.exists) {
/* If the object does not exist here, it must have been removed
*/
continue;
}
- bi->objects[*p] = obc->obs.oi.version;
- dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
+ version = obc->obs.oi.version;
+ shard_versions = obc->obs.oi.shard_versions;
} else {
bufferlist bl;
int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
ceph_assert(r >= 0);
object_info_t oi(bl);
- bi->objects[*p] = oi.version;
- dout(20) << " " << *p << " " << oi.version << dendl;
+ version = oi.version;
+ shard_versions = oi.shard_versions;
+ }
+ dout(20) << " " << *p << " " << version << dendl;
+ if (shard_versions.empty()) {
+ bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+ version)));
+ } else {
+ bool added_default = false;
+ for (auto & shard: backfill_targets) {
+        if (shard_versions.contains(shard.shard)) {
+          // This shard missed recent partial writes; record its version
+          bi->objects.insert(make_pair(*p,
+            std::make_pair(shard.shard, shard_versions.at(shard.shard))));
+ } else if (!added_default) {
+ bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+ version)));
+ added_default = true;
+ }
+ }
}
}
}
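
An illustrative sketch (invented values) of what the loop above inserts
for one object with oi.version = 1'23, shard_versions = {1: 1'20} and
backfill targets on shards 1 and 3:

    // bi->objects gains two entries for *p:
    //   (*p, {1, 1'20})         shard 1 missed recent partial writes
    //   (*p, {NO_SHARD, 1'23})  default entry covering shard 3 and any
    //                           other shard without a specific entry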
+void PrimaryLogPG::scan_range_replica(
+ int min, int max, ReplicaBackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_locked());
+ dout(10) << "scan_range_replica from " << bi->begin << dendl;
+ bi->clear_objects();
+
+ vector<hobject_t> ls;
+ ls.reserve(max);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
+ ceph_assert(r >= 0);
+ dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
+ dout(20) << ls << dendl;
+
+ for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ handle.reset_tp_timeout();
+
+ ceph_assert(!is_primary());
+ bufferlist bl;
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
+ /* If the object does not exist here, it must have been removed
+ * between the collection_list_partial and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ if (r == -ENOENT)
+ continue;
+
+ ceph_assert(r >= 0);
+ object_info_t oi(bl);
+ bi->objects[*p] = oi.version;
+ dout(20) << " " << *p << " " << oi.version << dendl;
+ }
+}
/** check_local
*
}
{
f->open_array_section("peer_backfill_info");
- for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+ for (std::map<pg_shard_t, ReplicaBackfillInterval>::const_iterator pbi =
peer_backfill_info.begin();
pbi != peer_backfill_info.end(); ++pbi) {
f->dump_stream("osd") << pbi->first;
* @bi.begin first item should be >= this value
 * @bi [out] resulting std::map of objects to eversion_t values
*/
- void scan_range(
- int min, int max, BackfillInterval *bi,
+ void scan_range_replica(
+ int min, int max, ReplicaBackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ void scan_range_primary(
+ int min, int max, PrimaryBackfillInterval *bi,
+ ThreadPool::TPHandle &handle,
+ const std::set<pg_shard_t> &backfill_targets
+ );
+
/// Update a hash range to reflect changes since the last scan
void update_range(
- BackfillInterval *bi, ///< [in,out] interval to update
+ PrimaryBackfillInterval *bi, ///< [in,out] interval to update
ThreadPool::TPHandle &handle ///< [in] tp handle
);
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "recovery_types.h"
-
-std::ostream& operator<<(std::ostream& out, const BackfillInterval& bi)
-{
- out << "BackfillInfo(" << bi.begin << "-" << bi.end
- << " " << bi.objects.size() << " objects";
- if (!bi.objects.empty())
- out << " " << bi.objects;
- out << ")";
- return out;
-}
-
-
* Possible states:
 * 1) begin == end == hobject_t() indicates that the interval is unpopulated
* 2) Else, objects contains all objects in [begin, end)
+ *
+ * ReplicaBackfillInterval
+ *
+ * Stores a map from hobject_t to eversion_t to track the version number of
+ * the objects being backfilled in an interval for one specific shard.
+ *
+ * PrimaryBackfillInterval
+ *
+ * Stores a multimap from hobject_t to pair<shard_id_t, eversion_t>.
+ *
+ * Only shards that are backfill targets will be tracked. For replicated and
+ * non-optimized EC pools there is one entry per hobject_t and shard_id_t will
+ * be NO_SHARD.
+ *
+ * For optimized EC pools partial writes mean it is possible that different
+ * shards have different eversions, hence there may be multiple entries per
+ * hobject_t. To conserve memory it is permitted to have an entry for NO_SHARD
+ * and additional entries for specific shards of the same hobject_t. In this
+ * case, shards that are not specifically listed are expected to be at the
+ * eversion of the NO_SHARD entry.
+ *
+ * Example: EC pool with 4+2 profile
+ *
+ * test:head, <NO_SHARD, 1'23>
+ * test:head, <1, 1'20>
+ *
+ * Shards 0 and 2-5 are expected to be at version 1'23; shard 1 has skipped
+ * recent updates and is expected to be at version 1'20.
*/
-struct BackfillInterval {
+
+template <typename T>
+class BackfillInterval {
+public:
// info about a backfill interval on a peer
eversion_t version; /// version at which the scan occurred
- std::map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
+ T objects;
+
+ virtual ~BackfillInterval() = default;
+ BackfillInterval() = default;
+ BackfillInterval(const BackfillInterval&) = default;
+ BackfillInterval(BackfillInterval&&) = default;
+ BackfillInterval& operator=(const BackfillInterval&) = default;
+ BackfillInterval& operator=(BackfillInterval&&) = default;
/// clear content
- void clear() {
- *this = BackfillInterval();
- }
+ virtual void clear() = 0;
/// clear the objects container only
void clear_objects() {
/// Adjusts begin to the first object
void trim() {
- if (!objects.empty())
+ if (!objects.empty()) {
begin = objects.begin()->first;
- else
+ } else {
begin = end;
+ }
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ virtual void pop_front() = 0;
+
+ /// dump
+ virtual void dump(ceph::Formatter *f) const = 0;
+};
+
+class PrimaryBackfillInterval: public BackfillInterval<std::multimap<hobject_t,
+ std::pair<shard_id_t, eversion_t>>> {
+public:
+
+ /// clear content
+ void clear() override {
+ *this = PrimaryBackfillInterval();
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ void pop_front() override {
+ ceph_assert(!objects.empty());
+ // Use erase(key) to erase all entries for key
+ objects.erase(objects.begin()->first);
+ trim();
+ }
+
+ /// dump
+ void dump(ceph::Formatter *f) const override {
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->open_array_section("objects");
+ for (const auto& [hoid, shard_version] : objects) {
+ const auto& [shard, version] = shard_version;
+ f->open_object_section("object");
+ f->dump_stream("object") << hoid;
+ f->dump_stream("shard") << shard;
+ f->dump_stream("version") << version;
+ f->close_section();
+ }
+ f->close_section();
+ }
+};
+
+class ReplicaBackfillInterval: public BackfillInterval<std::map<hobject_t,
+ eversion_t>> {
+public:
+ /// clear content
+ void clear() override {
+ *this = ReplicaBackfillInterval();
}
/// drop first entry, and adjust @begin accordingly
}
/// dump
- void dump(ceph::Formatter *f) const {
+ void dump(ceph::Formatter *f) const override {
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->open_array_section("objects");
- for (std::map<hobject_t, eversion_t>::const_iterator i =
- objects.begin();
- i != objects.end();
- ++i) {
+ for (const auto& [hoid, version] : objects) {
f->open_object_section("object");
- f->dump_stream("object") << i->first;
- f->dump_stream("version") << i->second;
+ f->dump_stream("object") << hoid;
+ f->dump_stream("version") << version;
f->close_section();
}
f->close_section();
}
};
-std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+template<typename T> std::ostream& operator<<(std::ostream& out,
+ const BackfillInterval<T>& bi)
+{
+ out << "BackfillInfo(" << bi.begin << "-" << bi.end << " ";
+ if (!bi.objects.empty()) {
+ out << bi.objects.size() << " objects " << bi.objects;
+ }
+ out << ")";
+ return out;
+}
#if FMT_VERSION >= 90000
-template <> struct fmt::formatter<BackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<PrimaryBackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<ReplicaBackfillInterval> : fmt::ostream_formatter {};
#endif
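
To make the NO_SHARD fallback concrete, here is a hedged sketch (not
part of the patch) of resolving the version a backfill target shard is
expected to be at for the first object of a trimmed
PrimaryBackfillInterval; it mirrors the lookup in
Enqueuing::update_on_peers() and in the classic OSD hunks above:

    // Shard-specific entries win; anything else falls back to NO_SHARD.
    // Assumes the interval is trimmed, so bi.begin is the first object.
    eversion_t expected_version(const PrimaryBackfillInterval &bi,
                                const shard_id_t shard)
    {
      std::map<shard_id_t, eversion_t> versions;
      const auto &[begin, end] = bi.objects.equal_range(bi.begin);
      for (const auto &entry : std::ranges::subrange(begin, end)) {
        const auto &[s, version] = entry.second;
        versions[s] = version;
      }
      if (auto it = versions.find(shard); it != versions.end()) {
        return it->second;
      }
      // Shards without a specific entry are at the NO_SHARD version
      return versions.at(shard_id_t::NO_SHARD);
    }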
add_executable(unittest-crimson-backfill
test_backfill.cc
${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
- ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc
- ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc)
+ ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc)
add_ceph_unittest(unittest-crimson-backfill
--memory 256M --smp 1)
-target_link_libraries(unittest-crimson-backfill crimson GTest::Main)
+target_link_libraries(unittest-crimson-backfill crimson GTest::Main Boost::MPL)
add_executable(unittest-seastar-buffer
test_buffer.cc)
#include <string>
#include <boost/statechart/event_base.hpp>
+
#include <gmock/gmock.h>
#include <gtest/gtest.h>
: hobject_t::get_max();
}
+ // Permit rhs (reference) objects to be the same version or 1 version older
+ bool looks_like(const FakeStore& rhs) const {
+ if (std::size(objs) != std::size(rhs.objs)) {
+ return false;
+ }
+ for (auto &[obj, version] : objs) {
+ if (!rhs.objs.contains(obj)) {
+ return false;
+ }
+ auto version_r = rhs.objs.at(obj);
+ if ((version.epoch != version_r.epoch) ||
+ ((version.version != version_r.version) &&
+ (version.version != version_r.version + 1)))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
bool operator==(const FakeStore& rhs) const {
return std::size(objs) == std::size(rhs.objs) && \
std::equal(std::begin(objs), std::end(objs), std::begin(rhs.objs));
}
+
bool operator!=(const FakeStore& rhs) const {
return !(*this == rhs);
}
eversion_t projected_last_update;
eversion_t log_tail;
PGLog pg_log;
+ pg_pool_t pool;
PGLog::IndexedLog projected_log;
FakePrimary(FakeStore&& store)
const bool all_replica_match = std::all_of(
std::begin(backfill_targets), std::end(backfill_targets),
[&reference] (const auto kv) {
- return kv.second.store == reference;
+ return kv.second.store.looks_like(reference);
});
return backfill_source.store == reference && all_replica_match;
}
return backfill_source.pg_log;
}
+ const pg_pool_t& get_pool() const override {
+ return backfill_source.pool;
+ }
+
void scan_log_after(eversion_t, scan_log_func_t) const override {
/* NOP */
}
const hobject_t& begin,
const hobject_t& end)
{
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.end = backfill_targets.at(target).store.list(begin, [&bi](auto kv) {
bi.objects.insert(std::move(kv));
});
void BackfillFixture::request_primary_scan(
const hobject_t& begin)
{
- BackfillInterval bi;
+ PrimaryBackfillInterval bi;
bi.end = backfill_source.store.list(begin, [&bi](auto kv) {
- bi.objects.insert(std::move(kv));
+ auto && [hoid,version] = kv;
+ eversion_t version_zero;
+ eversion_t version_next = eversion_t(version.epoch, version.version + 1);
+ switch (std::rand() % 4) {
+ case 0:
+ // All shards at same version (Replica, EC, optimized EC after full-stripe write)
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+ break;
+ case 1:
+ // Optimized EC partial write - Shard 3 at an earlier version
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+ break;
+ case 2:
+      // Optimized EC partial write - Shards 1 and 2 at an earlier version
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+ break;
+ case 3:
+      // Optimized EC partial write - Shards 1, 2 and 3 at different earlier versions
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+ break;
+ }
});
bi.begin = begin;
bi.version = backfill_source.last_update;
BackfillFixtureBuilder&& add_target(FakeStore::objs_t objs) && {
const auto new_osd_num = std::size(backfill_targets);
+ const auto new_shard_id = shard_id_t(1 + new_osd_num);
const auto [ _, inserted ] = backfill_targets.emplace(
- new_osd_num, FakeReplica{ FakeStore{std::move(objs)} });
+ pg_shard_t(new_osd_num, new_shard_id),
+ FakeReplica{ FakeStore{std::move(objs)} });
ceph_assert(inserted);
return std::move(*this);
}
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
+TEST(backfill, one_same_one_empty_replica)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
TEST(backfill, two_empty_replicas)
{
const auto reference_store = FakeStore{ {
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
+TEST(backfill, one_behind_one_empty_replica)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 250} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 247} },
+ //"1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", deleted
+ }};
+ const auto behind_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+ //"1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head" missing
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {8, 165} },
+ { "1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", {8, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ behind_store.objs
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
TEST(backfill, cancel_resume_middle_of_primaryscan)
{
const auto reference_store = FakeStore{ {