Summary:
WaitReplica submits two asyncronous events through the OSD scheduler
concurrently: DigestUpdate and MapsCompared. This causes the error
described in https://tracker.ceph.com/issues/59049 because MapsCompared
must be processed first, but they may be scheduled with differing
priorities depending on whether there are writes blocked. This patch
avoids the problem by simply processing MapsCompared syncronously.
Details:
Within the WaitReplicas state handler for GotReplicas, we invoke
PGScrubber::maps_compare_n_cleanup:
sc::result WaitReplicas::react(const GotReplicas&)
{
...
// maps_compare_n_cleanup() will arrange for MapsCompared event to be
// sent:
scrbr->maps_compare_n_cleanup();
return discard_event();
void PgScrubber::maps_compare_n_cleanup()
{
...
auto required_fixes =
m_be->scrub_compare_maps(
m_end.is_max(),
m_listener->sl_get_snap_map_reader());
...
m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
PgScrubber::maps_compare_n_cleanup arranges for the MapsCompared event
to be submitted through the OSD schdeduler to be delivered to the scrub
state machine later. It also invokes scrub_compare_maps
objs_fix_list_t ScrubBackend::scrub_compare_maps(
bool max_reached,
SnapMapReaderI& snaps_getter)
{
...
scrub_snapshot_metadata(for_meta_scrub);
which in turn invokes scrub_snapshot_metadata
void ScrubBackend::scrub_snapshot_metadata(ScrubMap& map)
{
...
m_scrubber.submit_digest_fixes(this_chunk->missing_digest);
which in turn invokes submit_digest_fixes
void PrimaryLogScrub::submit_digest_fixes(const digests_fixes_t& fixes)
{
...
for (auto& [obj, dgs] : fixes) {
...
PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
...
m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
...
ctx->register_on_success([this]() {
if ((num_digest_updates_pending >= 1) &&
(--num_digest_updates_pending == 0)) {
m_osds->queue_scrub_digest_update(m_pl_pg,
m_pl_pg->is_scrub_blocking_ops());
}
});
m_pl_pg->simple_opc_submit(std::move(ctx));
}
}
which submits a sequence of repops updating objects' digests and
schduling the DigestUpdate event once they have completed.
These two events, DigestUpdate and MapsCompared, are therefore in flight
concurrently. However, only one event ordering is actually tolerated --
we must process MapsCompared first in order to be in state
WaitDigestUpdate when the DigestUpdate event arrives. This would be
fine except that the priority at which we submit the DigestUpdate event
may end up with a higher priority if there is currently an op blocked.
There are two ways we could solve this. We could ensure that the two
are always queued at the same priority and therefore will be processed
in order. However, there's actually no reason to process the
MapsCompared event asyncronously -- it would be far less brittle to
simply process the MapsCompared syncronously and avoid this problem
entirely.
Fixes: https://tracker.ceph.com/issues/59049
Signed-off-by: Samuel Just <sjust@redhat.com>
(cherry picked from commit
604860e54098b20ea65785dbcf7553dd50b22532)
queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
}
-void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
-{
- // Resulting scrub event: 'MapsCompared'
- queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
-}
-
void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
{
// Resulting scrub event: 'ReplicaPushesUpd'
/// Signals that there are more chunks to handle
void queue_scrub_next_chunk(PG* pg, Scrub::scrub_prio_t with_priority);
- /// Signals that we have finished comparing the maps for this chunk
- /// Note: required, as in Crimson this operation is 'futurized'.
- void queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority);
-
void queue_for_rep_scrub(PG* pg,
Scrub::scrub_prio_t with_high_priority,
unsigned int qu_priority,
"ReplicaPushesUpd");
}
- void scrub_send_maps_compared(epoch_t queued, ThreadPool::TPHandle& handle)
- {
- forward_scrub_event(&ScrubPgIF::send_maps_compared, queued, "MapsCompared");
- }
-
void scrub_send_get_next_chunk(epoch_t queued, ThreadPool::TPHandle& handle)
{
forward_scrub_event(&ScrubPgIF::send_get_next_chunk, queued, "NextChunk");
pg->unlock();
}
-void PGScrubMapsCompared::run(OSD* osd,
- OSDShard* sdata,
- PGRef& pg,
- ThreadPool::TPHandle& handle)
-{
- pg->scrub_send_maps_compared(epoch_queued, handle);
- pg->unlock();
-}
-
void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
{
pg->replica_scrub(epoch_queued, activation_index, handle);
void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
};
-class PGScrubMapsCompared : public PGScrubItem {
- public:
- PGScrubMapsCompared(spg_t pg, epoch_t epoch_queued)
- : PGScrubItem{pg, epoch_queued, "PGScrubMapsCompared"}
- {}
- void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
-};
-
class PGRepScrub : public PGScrubItem {
public:
PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
dout(10) << "scrubber event --<< " << __func__ << dendl;
}
-void PgScrubber::send_maps_compared(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
- << dendl;
-
- m_fsm->process_event(Scrub::MapsCompared{});
-
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
// -----------------
bool PgScrubber::is_reserving() const
// requeue the writes from the chunk that just finished
requeue_waiting();
- m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
}
Scrub::preemption_t& PgScrubber::get_preemptor()
void send_local_map_done(epoch_t epoch_queued) final;
- void send_maps_compared(epoch_t epoch_queued) final;
-
void send_get_next_chunk(epoch_t epoch_queued) final;
void send_scrub_is_finished(epoch_t epoch_queued) final;
return transit<PendingTimer>();
} else {
-
- // maps_compare_n_cleanup() will arrange for MapsCompared event to be
- // sent:
scrbr->maps_compare_n_cleanup();
- return discard_event();
+ return transit<WaitDigestUpdate>();
}
} else {
return discard_event();
/// scrub_snapshot_metadata()
MEV(DigestUpdate)
-/// maps_compare_n_cleanup() transactions are done
-MEV(MapsCompared)
-
/// initiating replica scrub
MEV(StartReplica)
using reactions = mpl::list<
// all replicas are accounted for:
sc::custom_reaction<GotReplicas>,
- sc::transition<MapsCompared, WaitDigestUpdate>,
sc::custom_reaction<DigestUpdate>>;
sc::result react(const GotReplicas&);
virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0;
- virtual void send_maps_compared(epoch_t epoch_queued) = 0;
-
virtual void on_applied_when_primary(const eversion_t& applied_version) = 0;
// --------------------------------------------------