pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
}
-void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
+void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
{
- unsigned scrub_queue_priority = pg->scrubber.priority;
- if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
- scrub_queue_priority = cct->_conf->osd_client_op_priority;
- }
- const auto epoch = pg->get_osdmap_epoch();
- enqueue_back(
- OpSchedulerItem(
- unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
- cct->_conf->osd_scrub_cost,
- scrub_queue_priority,
- ceph_clock_now(),
- 0,
- epoch));
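+  // Resulting scrub event: 'StartScrub' (see m_scrubber->send_start_scrub() below)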
+ queue_scrub_event_msg<PGScrub>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
+{
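+  // Resulting scrub event: 'AfterRepairScrub' (via send_start_after_repair())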
+ queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
+}
+
+void OSDService::queue_for_rep_scrub(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority)
+{
+ queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
+}
+
+void OSDService::queue_for_rep_scrub_resched(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority)
+{
+ // Resulting scrub event: 'SchedReplica'
+ queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
}
void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
}

void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
}
+void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
+{
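+  // Resulting scrub event: 'InternalSchedScrub'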
+ queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ActivePushesUpd'
+ queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
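+  // Resulting scrub event: 'UpdatesApplied'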
+ queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'Unblocked'
+ queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'DigestUpdate'
+ queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'GotReplicas'
+ queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ReplicaPushesUpd'
+ queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
+}
+
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
return pgid < rhs.pgid;
}
+// this one is only moved here (from the header) temporarily, for debugging:
+void OSDService::unreg_pg_scrub(spg_t pgid, utime_t t)
+{
+ std::lock_guard l{OSDService::sched_scrub_lock};
+ size_t removed = sched_scrub_pg.erase(ScrubJob{cct, pgid, t});
+ ceph_assert(removed);
+ dout(10) << __func__ << " scrub-set removed: " << pgid << " T(" << t << ")" << dendl;
+}
+
+// this one is only moved here (from the header) temporarily, for debugging:
+utime_t OSDService::reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
+ double pool_scrub_max_interval, bool must)
+{
+ ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+ must);
+ std::lock_guard l(OSDService::sched_scrub_lock);
+ auto [x, inserted] = sched_scrub_pg.insert(scrub_job);
+ dout(10) << __func__ << " scrub-set inserted: " << pgid << " T(" << t << ")" << " must: " << must << " inserted "
+ << inserted << dendl;
+ return scrub_job.sched_time;
+}
+
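+// dumps the ordered scrub-jobs set; a sample entry (values illustrative):
+//   {"pgid":"2.a","sched_time":"...","deadline":"...","forced":false}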
+void OSDService::dumps_scrub(ceph::Formatter *f)
+{
+ ceph_assert(f != nullptr);
+ std::lock_guard l(sched_scrub_lock);
+
+ f->open_array_section("scrubs");
+ for (const auto &i: sched_scrub_pg) {
+ f->open_object_section("scrub");
+ f->dump_stream("pgid") << i.pgid;
+ f->dump_stream("sched_time") << i.sched_time;
+ f->dump_stream("deadline") << i.deadline;
+ f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
+ f->close_section();
+ }
+ f->close_section();
+}
+
double OSD::scrub_sleep_time(bool must_scrub)
{
if (must_scrub) {
void OSD::sched_scrub()
{
+ dout(20) << __func__ << " sched_scrub starts" << dendl;
+
// if not permitted, fail fast
if (!service.can_inc_scrubs()) {
+ dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
return;
}
bool allow_requested_repair_only = false;
if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
if (!cct->_conf->osd_repair_during_recovery) {
- dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
+ dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
return;
}
dout(10) << __func__
bool load_is_low = scrub_load_below_threshold();
dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
- OSDService::ScrubJob scrub;
- if (service.first_scrub_stamp(&scrub)) {
+ OSDService::ScrubJob scrub_job;
+ if (service.first_scrub_stamp(&scrub_job)) {
do {
dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
- if (scrub.sched_time > now) {
+ if (scrub_job.sched_time > now) {
// save ourselves some effort
- dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
+ dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
<< " > " << now << dendl;
break;
}
- if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
- dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
+ if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
+ dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
<< (!time_permit ? "time not permit" : "high load") << dendl;
continue;
}
- PGRef pg = _lookup_lock_pg(scrub.pgid);
- if (!pg)
+ PGRef pg = _lookup_lock_pg(scrub_job.pgid);
+ if (!pg) {
+ dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
continue;
+ }
+
// This has already started, so go on to the next scrub job
- if (pg->scrubber.active) {
+ if (pg->is_scrub_active()) {
pg->unlock();
- dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
+ dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
continue;
}
- // Skip other kinds of scrubing if only explicitly requested repairing is allowed
- if (allow_requested_repair_only && !pg->scrubber.must_repair) {
+ // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
+ if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
pg->unlock();
- dout(10) << __func__ << " skip " << scrub.pgid
+ dout(10) << __func__ << " skip " << scrub_job.pgid
<< " because repairing is not explicitly requested on it"
<< dendl;
continue;
}
+
// If it is reserving, let it resolve before going to the next scrub job
- if (pg->scrubber.local_reserved && !pg->scrubber.active) {
+ if (pg->m_scrubber->is_reserving()) {
pg->unlock();
- dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
+ dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
break;
}
- dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
+ dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
<< (pg->get_must_scrub() ? ", explicitly requested" :
(load_is_low ? ", load_is_low" : " deadline < now"))
<< dendl;
if (pg->sched_scrub()) {
pg->unlock();
+ dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
break;
}
pg->unlock();
- } while (service.next_scrub_stamp(scrub, &scrub));
+ } while (service.next_scrub_stamp(scrub_job, &scrub_job));
}
dout(20) << "sched_scrub done" << dendl;
}
void OSD::resched_all_scrubs()
{
dout(10) << __func__ << ": start" << dendl;
- OSDService::ScrubJob scrub;
- if (service.first_scrub_stamp(&scrub)) {
+ OSDService::ScrubJob scrub_job;
+ if (service.first_scrub_stamp(&scrub_job)) {
do {
- dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
+ dout(20) << __func__ << ": examine " << scrub_job.pgid << dendl;
- PGRef pg = _lookup_lock_pg(scrub.pgid);
+ PGRef pg = _lookup_lock_pg(scrub_job.pgid);
if (!pg)
continue;
- if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
- dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
+ if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
+ dout(15) << __func__ << ": reschedule " << scrub_job.pgid << dendl;
pg->on_info_history_change();
}
pg->unlock();
- } while (service.next_scrub_stamp(scrub, &scrub));
+ } while (service.next_scrub_stamp(scrub_job, &scrub_job));
}
dout(10) << __func__ << ": done" << dendl;
}
};
std::set<ScrubJob> sched_scrub_pg;
- /// @returns the scrub_reg_stamp used for unregister the scrub job
+ /// @returns the scrub_reg_stamp used for unregistering the scrub job
utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
- double pool_scrub_max_interval, bool must) {
- ScrubJob scrub(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
- must);
- std::lock_guard l(sched_scrub_lock);
- sched_scrub_pg.insert(scrub);
- return scrub.sched_time;
- }
- void unreg_pg_scrub(spg_t pgid, utime_t t) {
- std::lock_guard l(sched_scrub_lock);
- size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t));
- ceph_assert(removed);
- }
+ double pool_scrub_max_interval, bool must);
+ void unreg_pg_scrub(spg_t pgid, utime_t t);
bool first_scrub_stamp(ScrubJob *out) {
std::lock_guard l(sched_scrub_lock);
if (sched_scrub_pg.empty())
return true;
}
- void dumps_scrub(ceph::Formatter *f) {
- ceph_assert(f != nullptr);
- std::lock_guard l(sched_scrub_lock);
-
- f->open_array_section("scrubs");
- for (const auto &i: sched_scrub_pg) {
- f->open_object_section("scrub");
- f->dump_stream("pgid") << i.pgid;
- f->dump_stream("sched_time") << i.sched_time;
- f->dump_stream("deadline") << i.deadline;
- f->dump_bool("forced", i.sched_time == PG::Scrubber::scrub_must_stamp());
- f->close_section();
- }
- f->close_section();
- }
+ void dumps_scrub(ceph::Formatter* f);
bool can_inc_scrubs();
bool inc_scrubs_local();
AsyncReserver<spg_t, Finisher> snap_reserver;
void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
void queue_for_snap_trim(PG *pg);
- void queue_for_scrub(PG *pg, bool with_high_priority);
+ void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
+ void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
/// queue the message (-> event) that all replicas reserved scrub resources for us
void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
/// queue the message (-> event) that some replicas denied our scrub resources request
void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority);
+ /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
+ /// of the primary map being created by the backend.
+ void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all pending updates were applied
+ void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// The object range that was locked (and blocked the scrub) has been freed
+ void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all write OPs are done
+ void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas
+ void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  void queue_for_rep_scrub(PG* pg,
+			   Scrub::scrub_prio_t with_priority,
+			   unsigned int qu_priority);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
+
+  void queue_for_rep_scrub_resched(PG* pg,
+				   Scrub::scrub_prio_t with_priority,
+				   unsigned int qu_priority);
+
void queue_for_pg_delete(spg_t pgid, epoch_t e);
bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
std::list<std::pair<epoch_t, PGRef> > awaiting_throttle;
/// queue a scrub-related message for a PG
- template<class MSG_TYPE>
- void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority, unsigned int qu_priority);
+ template <class MSG_TYPE>
+ void queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority);
/// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
/// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
- template<class MSG_TYPE>
+ template <class MSG_TYPE>
void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority);
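+  /// (both overloads wrap the MSG_TYPE event in an OpSchedulerItem; the queuing
+  /// priority is translated via PG::scrub_requeue_priority())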
utime_t defer_recovery_until;
friend class PG;
friend struct OSDShard;
friend class PrimaryLogPG;
+ friend class PgScrubber;
protected:
pg_stats_publish_valid(false),
finish_sync_event(NULL),
scrub_after_recovery(false),
- save_req_scrub(false),
active_pushes(0),
recovery_state(
o->cct,
osd->pg_recovery_stats.log_exit(
state_name, ceph_clock_now() - enter_time, events, event_dur);
}
-
+
/********* PG **********/
void PG::remove_snap_mapped_object(
finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
release_pg_backoffs();
- scrubber.reserved_peers.clear();
+ m_scrubber->unreserve_replicas();
scrub_after_recovery = false;
- save_req_scrub = false;
agent_clear();
}
-PG::Scrubber::Scrubber()
- : local_reserved(false), remote_reserved(false), reserve_failed(false),
- epoch_start(0),
- active(false),
- shallow_errors(0), deep_errors(0), fixed(0),
- must_scrub(false), must_deep_scrub(false), must_repair(false),
- need_auto(false), req_scrub(false), time_for_deep(false),
- auto_repair(false),
- check_repair(false),
- deep_scrub_on_error(false),
- num_digest_updates_pending(0),
- state(INACTIVE),
- deep(false)
-{}
-
-PG::Scrubber::~Scrubber() {}
bool PG::op_has_sufficient_caps(OpRequestRef& op)
{
return cap;
}
-bool PG::requeue_scrub(bool high_priority)
-{
- ceph_assert(ceph_mutex_is_locked(_lock));
- if (scrub_queued) {
- dout(10) << __func__ << ": already queued" << dendl;
- return false;
- } else {
- dout(10) << __func__ << ": queueing" << dendl;
- scrub_queued = true;
- osd->queue_for_scrub(this, high_priority);
- return true;
- }
-}
-
void PG::queue_recovery()
{
if (!is_primary() || !is_peered()) {
}
}
-bool PG::queue_scrub()
+void PG::queue_scrub_after_repair()
{
+ dout(10) << __func__ << dendl;
ceph_assert(ceph_mutex_is_locked(_lock));
+
+ m_planned_scrub.must_deep_scrub = true;
+ m_planned_scrub.check_repair = true;
+ m_planned_scrub.must_scrub = true;
+
if (is_scrubbing()) {
- return false;
- }
- // An interrupted recovery repair could leave this set.
- state_clear(PG_STATE_REPAIR);
- if (scrubber.need_auto) {
- scrubber.must_scrub = true;
- scrubber.must_deep_scrub = true;
- scrubber.auto_repair = true;
- scrubber.need_auto = false;
- }
- scrubber.priority = scrubber.must_scrub ?
- cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
- scrubber.must_scrub = false;
- state_set(PG_STATE_SCRUBBING);
- if (scrubber.must_deep_scrub) {
- state_set(PG_STATE_DEEP_SCRUB);
- scrubber.must_deep_scrub = false;
+ dout(10) << __func__ << ": scrubbing already" << dendl;
+ return;
}
- if (scrubber.must_repair || scrubber.auto_repair) {
- state_set(PG_STATE_REPAIR);
- scrubber.must_repair = false;
+ if (scrub_queued) {
+ dout(10) << __func__ << ": already queued" << dendl;
+ return;
}
- requeue_scrub();
- return true;
-}
-void PG::scrub_send_resources_granted(epoch_t epoch_queued,
- [[maybe_unused]] ThreadPool::TPHandle& handle)
-{
- dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
- //m_scrubber->send_remotes_reserved();
-}
+ m_scrubber->set_op_parameters(m_planned_scrub);
+ dout(15) << __func__ << ": queueing" << dendl;
-void PG::scrub_send_resources_denied(epoch_t epoch_queued,
- [[maybe_unused]] ThreadPool::TPHandle& handle)
-{
- dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
- //m_scrubber->send_reservation_failure();
+ scrub_queued = true;
+ osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority);
}
unsigned PG::get_scrub_priority()
{
// a higher value -> a higher priority
- int64_t pool_scrub_priority = 0;
- pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
+ int64_t pool_scrub_priority =
+ pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0);
return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
}
return finish_sync_event;
}
-void PG::_finish_recovery(Context *c)
+void PG::_finish_recovery(Context* c)
{
+ dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? "
+ << is_clean() << dendl;
+
std::scoped_lock locker{*this};
if (recovery_state.is_deleting() || !is_clean()) {
dout(10) << __func__ << " raced with delete or repair" << dendl;
// When recovery is initiated by a repair, that flag is left on
state_clear(PG_STATE_REPAIR);
if (c == finish_sync_event) {
- dout(10) << "_finish_recovery" << dendl;
+ dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl;
finish_sync_event = 0;
recovery_state.purge_strays();
if (scrub_after_recovery) {
dout(10) << "_finish_recovery requeueing for scrub" << dendl;
scrub_after_recovery = false;
- scrubber.must_deep_scrub = true;
- scrubber.check_repair = true;
- // We remember whether req_scrub was set when scrub_after_recovery set to true
- scrubber.req_scrub = save_req_scrub;
- queue_scrub();
+ queue_scrub_after_repair();
}
} else {
dout(10) << "_finish_recovery -- stale" << dendl;
}
}
+bool PG::get_must_scrub() const
+{
+ dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl;
+ return m_planned_scrub.must_scrub;
+}
unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
{
- return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority);
+ return m_scrubber->scrub_requeue_priority(with_priority);
}
unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const
{
- return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
+ return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
}
// ==========================================================================================
// SCRUB
/*
- * when holding pg and sched_scrub_lock, then the states are:
- * scheduling:
- * scrubber.local_reserved = true
- * scrubber.active = false
- * scrubber.reserved_peers includes whoami
- * osd->scrubs_local++
- * scheduling, replica declined:
- * scrubber.local_reserved = true
- * scrubber.reserved_peers includes -1
- * osd->scrub_local++
- * pending:
- * scrubber.local_reserved = true
- * scrubber.active = false
- * scrubber.reserved_peers.size() == acting.size();
- * pg on scrub_wq
- * osd->scrub_local++
- * scrubbing:
- * scrubber.local_reserved = true;
- * scrubber.active = true
- * scrubber.reserved_peers empty
+ * implementation note:
+ * PG::sched_scrub() is called only once per specific scrub session.
+ * That call commits us to whatever choices were made (deep/shallow, etc.).
+ * Unless scrubbing fails to start, the 'planned scrub' flag-set is 'frozen' into
+ * PgScrubber's m_flags, then cleared.
*/
-
-// returns true if a scrub has been newly kicked off
bool PG::sched_scrub()
{
+ dout(15) << __func__ << " pg(" << info.pgid
+ << (is_active() ? ") <active>" : ") <not-active>")
+ << (is_clean() ? " <clean>" : " <not-clean>") << dendl;
ceph_assert(ceph_mutex_is_locked(_lock));
ceph_assert(!is_scrubbing());
- if (!(is_primary() && is_active() && is_clean())) {
+
+ if (!is_primary() || !is_active() || !is_clean()) {
return false;
}
- // All processing the first time through commits us to whatever
- // choices are made.
- if (!scrubber.local_reserved) {
- dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
-
- bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
- pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
- bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
- pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
- bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
- bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
- && get_pgbackend()->auto_repair_supported());
-
- scrubber.time_for_deep = false;
- // Clear these in case user issues the scrub/repair command during
- // the scheduling of the scrub/repair (e.g. request reservation)
- scrubber.deep_scrub_on_error = false;
- scrubber.auto_repair = false;
+ if (scrub_queued) {
+ // only applicable to the very first time a scrub event is queued
+ // (until handled and posted to the scrub FSM)
+ dout(10) << __func__ << ": already queued" << dendl;
+ return false;
+ }
- // All periodic scrub handling goes here because must_scrub is
- // always set for must_deep_scrub and must_repair.
- if (!scrubber.must_scrub) {
- ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
- // Handle deep scrub determination only if allowed
- if (allow_deep_scrub) {
- // Initial entry and scheduled scrubs without nodeep_scrub set get here
- if (scrubber.need_auto) {
- dout(20) << __func__ << ": need repair after scrub errors" << dendl;
- scrubber.time_for_deep = true;
- } else {
- double deep_scrub_interval = 0;
- pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
- if (deep_scrub_interval <= 0) {
- deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
- }
- scrubber.time_for_deep = ceph_clock_now() >=
- info.history.last_deep_scrub_stamp + deep_scrub_interval;
-
- bool deep_coin_flip = false;
- // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
- // we will deep scrub because this function is called often.
- if (!scrubber.time_for_deep && allow_scrub)
- deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
- dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
-
- scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
- }
+  // analyse the combination of the requested scrub flags, the osd/pool configuration,
+  // and the PG status to determine whether we should scrub now, and what type of
+  // scrub it should be.
+ auto updated_flags = verify_scrub_mode();
+ if (!updated_flags) {
+ // the stars do not align for starting a scrub for this PG at this time
+ // (due to configuration or priority issues)
+ // The reason was already reported by the callee.
+ dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
+ return false;
+ }
- if (!scrubber.time_for_deep && has_deep_errors) {
- osd->clog->info() << "osd." << osd->whoami
- << " pg " << info.pgid
- << " Deep scrub errors, upgrading scrub to deep-scrub";
- scrubber.time_for_deep = true;
- }
+ // try to reserve the local OSD resources. If failing: no harm. We will
+ // be retried by the OSD later on.
+ if (!m_scrubber->reserve_local()) {
+ dout(10) << __func__ << ": failed to reserve locally" << dendl;
+ return false;
+ }
- if (try_to_auto_repair) {
- if (scrubber.time_for_deep) {
- dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
- scrubber.auto_repair = true;
- } else if (allow_scrub) {
- dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
- scrubber.deep_scrub_on_error = true;
- }
- }
- } else { // !allow_deep_scrub
- dout(20) << __func__ << ": nodeep_scrub set" << dendl;
- if (has_deep_errors) {
- osd->clog->error() << "osd." << osd->whoami
- << " pg " << info.pgid
- << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
- return false;
- }
- }
+ // can commit to the updated flags now, as nothing will stop the scrub
+ m_planned_scrub = *updated_flags;
- //NOSCRUB so skip regular scrubs
- if (!allow_scrub && !scrubber.time_for_deep) {
- return false;
- }
- // scrubber.must_scrub
- } else if (!scrubber.must_deep_scrub && has_deep_errors) {
- osd->clog->error() << "osd." << osd->whoami
- << " pg " << info.pgid
- << " Regular scrub request, deep-scrub details will be lost";
- }
- // Unless precluded this was handle above
- scrubber.need_auto = false;
-
- ceph_assert(scrubber.reserved_peers.empty());
- bool allow_scrubing = cct->_conf->osd_scrub_during_recovery ||
- (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
- !osd->is_recovery_active();
- if (allow_scrubing &&
- osd->inc_scrubs_local()) {
- dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
- scrubber.local_reserved = true;
- scrubber.reserved_peers.insert(pg_whoami);
- scrub_reserve_replicas();
- } else {
- dout(20) << __func__ << ": failed to reserve locally" << dendl;
- return false;
- }
+ // An interrupted recovery repair could leave this set.
+ state_clear(PG_STATE_REPAIR);
+
+  // Pass control to the scrubber. It is the scrubber that handles the replicas'
+  // resource reservations.
+ m_scrubber->set_op_parameters(m_planned_scrub);
+
+ dout(10) << __func__ << ": queueing" << dendl;
+
+ scrub_queued = true;
+ osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority);
+ return true;
+}
+
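+// the time the next deep scrub is due: last_deep_scrub_stamp plus the pool's
+// DEEP_SCRUB_INTERVAL option (if set), otherwise the osd_deep_scrub_interval
+// configuration value (one week, i.e. 604800 seconds, by default)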
+double PG::next_deepscrub_interval() const
+{
+ double deep_scrub_interval =
+ pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
+ if (deep_scrub_interval <= 0.0)
+ deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+ return info.history.last_deep_scrub_stamp + deep_scrub_interval;
+}
+
+bool PG::is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const
+{
+ dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? " << allow_deep_scrub << dendl;
+
+ if (!allow_deep_scrub)
+ return false;
+
+ if (planned.need_auto) {
+ dout(10) << __func__ << ": need repair after scrub errors" << dendl;
+ return true;
}
- if (scrubber.local_reserved) {
- if (scrubber.reserve_failed) {
- dout(20) << __func__ << ": failed, a peer declined" << dendl;
- clear_scrub_reserved();
- scrub_unreserve_replicas();
+ if (ceph_clock_now() >= next_deepscrub_interval())
+ return true;
+
+ if (has_deep_errors) {
+ osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Deep scrub errors, upgrading scrub to deep-scrub";
+ return true;
+ }
+
+  // we only flip coins if 'allow_scrub' is asserted. Otherwise, as this function is
+  // called often, we would end up deep-scrubbing most of the time.
+ if (allow_scrub) {
+ bool deep_coin_flip =
+ (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
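+    // (with the default osd_deep_scrub_randomize_ratio of 0.15, ~15% of eligible
+    //  shallow scrubs are promoted to deep scrubs by this coin flip)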
+
+ dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep
+ << " deep_coin_flip=" << deep_coin_flip << dendl;
+
+ if (deep_coin_flip)
+ return true;
+ }
+
+ return false;
+}
+
+bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const
+
+{
+ ceph_assert(!planned.must_deep_scrub && !planned.must_repair);
+
+ if (!allow_deep_scrub && has_deep_errors) {
+ osd->clog->error()
+ << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
return false;
- } else if (scrubber.reserved_peers.size() == get_actingset().size()) {
- dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
- if (scrubber.time_for_deep) {
- dout(10) << __func__ << ": scrub will be deep" << dendl;
- state_set(PG_STATE_DEEP_SCRUB);
- scrubber.time_for_deep = false;
+ }
+
+ if (allow_deep_scrub) {
+ // Initial entry and scheduled scrubs without nodeep_scrub set get here
+
+ planned.time_for_deep =
+ is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned);
+
+ if (try_to_auto_repair) {
+ if (planned.time_for_deep) {
+ dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+ planned.auto_repair = true;
+ } else if (allow_regular_scrub) {
+ dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found"
+ << dendl;
+ planned.deep_scrub_on_error = true;
}
- queue_scrub();
- } else {
- // none declined, since scrubber.reserved is set
- dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
- << ", waiting for replicas" << dendl;
}
}
+
+ dout(20) << __func__ << " updated flags: " << planned
+ << " allow_regular_scrub: " << allow_regular_scrub << dendl;
+
+ // NOSCRUB so skip regular scrubs
+ if (!allow_regular_scrub && !planned.time_for_deep) {
+ return false;
+ }
+
return true;
}
-bool PG::is_scrub_registered()
+std::optional<requested_scrub_t> PG::verify_scrub_mode() const
{
- return !scrubber.scrub_reg_stamp.is_zero();
-}
+ dout(10) << __func__ << " processing pg " << info.pgid << dendl;
-void PG::reg_next_scrub()
-{
- if (!is_primary())
- return;
+ bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+ bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
+ bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
+ bool try_to_auto_repair =
+ (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported());
- utime_t reg_stamp;
- bool must = false;
- if (scrubber.must_scrub || scrubber.need_auto) {
- // Set the smallest time that isn't utime_t()
- reg_stamp = Scrubber::scrub_must_stamp();
- must = true;
- } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
- reg_stamp = ceph_clock_now();
- must = true;
- } else {
- reg_stamp = info.history.last_scrub_stamp;
+ auto upd_flags = m_planned_scrub;
+
+ upd_flags.time_for_deep = false;
+ // Clear these in case user issues the scrub/repair command during
+ // the scheduling of the scrub/repair (e.g. request reservation)
+ upd_flags.deep_scrub_on_error = false;
+ upd_flags.auto_repair = false;
+
+ if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) {
+ osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub request, deep-scrub details will be lost";
+ }
+
+ if (!upd_flags.must_scrub) {
+ // All periodic scrub handling goes here because must_scrub is
+ // always set for must_deep_scrub and must_repair.
+
+ bool can_start_periodic =
+ verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair,
+ allow_regular_scrub, has_deep_errors, upd_flags);
+ if (!can_start_periodic) {
+ return std::nullopt;
+ }
}
- // note down the sched_time, so we can locate this scrub, and remove it
- // later on.
- double scrub_min_interval = 0, scrub_max_interval = 0;
- pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
- pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
- ceph_assert(!is_scrub_registered());
- scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
- reg_stamp,
- scrub_min_interval,
- scrub_max_interval,
- must);
- dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
- << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
-}
-
-void PG::unreg_next_scrub()
-{
- if (is_scrub_registered()) {
- osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
- scrubber.scrub_reg_stamp = utime_t();
+
+ // scrubbing while recovering?
+
+ bool prevented_by_recovery =
+ osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery &&
+ (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair);
+
+ if (prevented_by_recovery) {
+ dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl;
+ return std::nullopt;
}
+
+ upd_flags.need_auto = false;
+ return upd_flags;
+}
+
+void PG::reg_next_scrub()
+{
+ m_scrubber->reg_next_scrub(m_planned_scrub);
}
void PG::on_info_history_change()
{
- unreg_next_scrub();
- reg_next_scrub();
+ m_scrubber->unreg_next_scrub();
+ m_scrubber->reg_next_scrub(m_planned_scrub);
}
-void PG::scrub_requested(bool deep, bool repair, bool need_auto)
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
{
- unreg_next_scrub();
- if (need_auto) {
- scrubber.need_auto = true;
- } else {
- scrubber.must_scrub = true;
- scrubber.must_deep_scrub = deep || repair;
- scrubber.must_repair = repair;
- // User might intervene, so clear this
- scrubber.need_auto = false;
- scrubber.req_scrub = true;
- }
- reg_next_scrub();
+ m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub);
}
void PG::clear_ready_to_merge() {
}
void PG::on_new_interval() {
+ dout(20) << __func__ << " scrub_queued was " << scrub_queued << " flags: " << m_planned_scrub << dendl;
scrub_queued = false;
projected_last_update = eversion_t();
cancel_recovery();
t.register_on_commit(new QueuePeeringEvt(this, on_commit));
}
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+ ceph_assert(!m_scrubber->are_callbacks_pending());
+ ceph_assert(callbacks_for_degraded_object.empty());
+ snap_trimq = snaps;
+ release_pg_backoffs();
+ projected_last_update = info.last_update;
+}
+
void PG::on_active_exit()
{
backfill_reserving = false;
}
}
-void PG::do_replica_scrub_map(OpRequestRef op)
-{
- auto m = op->get_req<MOSDRepScrubMap>();
- dout(7) << __func__ << " " << *m << dendl;
- if (m->map_epoch < info.history.same_interval_since) {
- dout(10) << __func__ << " discarding old from "
- << m->map_epoch << " < " << info.history.same_interval_since
- << dendl;
- return;
- }
- if (!scrubber.is_chunky_scrub_active()) {
- dout(10) << __func__ << " scrub isn't active" << dendl;
- return;
- }
-
- op->mark_started();
-
- auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
- scrubber.received_maps[m->from].decode(p, info.pgid.pool());
- dout(10) << "map version is "
- << scrubber.received_maps[m->from].valid_through
- << dendl;
-
- dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
- << dendl;
- ceph_assert(scrubber.waiting_on_whom.count(m->from));
- scrubber.waiting_on_whom.erase(m->from);
- if (m->preempted) {
- dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
- scrub_preempted = true;
- }
- if (scrubber.waiting_on_whom.empty()) {
- requeue_scrub(ops_blocked_by_scrub());
- }
-}
-
-// send scrub v3 messages (chunky scrub)
-void PG::_request_scrub_map(
- pg_shard_t replica, eversion_t version,
- hobject_t start, hobject_t end,
- bool deep,
- bool allow_preemption)
-{
- ceph_assert(replica != pg_whoami);
- dout(10) << "scrub requesting scrubmap from osd." << replica
- << " deep " << (int)deep << dendl;
- MOSDRepScrub *repscrubop = new MOSDRepScrub(
- spg_t(info.pgid.pgid, replica.shard), version,
- get_osdmap_epoch(),
- get_last_peering_reset(),
- start, end, deep,
- allow_preemption,
- scrubber.priority,
- ops_blocked_by_scrub());
- // default priority, we want the rep scrub processed prior to any recovery
- // or client io messages (we are holding a lock!)
- osd->send_message_osd_cluster(
- replica.osd, repscrubop, get_osdmap_epoch());
-}
-
-void PG::handle_scrub_reserve_request(OpRequestRef op)
-{
- dout(7) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
- if (scrubber.remote_reserved) {
- dout(10) << __func__ << " ignoring reserve request: Already reserved"
- << dendl;
- return;
- }
- if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
- osd->inc_scrubs_remote()) {
- scrubber.remote_reserved = true;
- } else {
- dout(20) << __func__ << ": failed to reserve remotely" << dendl;
- scrubber.remote_reserved = false;
- }
- auto m = op->get_req<MOSDScrubReserve>();
- Message *reply = new MOSDScrubReserve(
- spg_t(info.pgid.pgid, get_primary().shard),
- m->map_epoch,
- scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
- pg_whoami);
- osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
- dout(7) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
- if (!scrubber.local_reserved) {
- dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
- return;
- }
- if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
- dout(10) << " already had osd." << from << " reserved" << dendl;
- } else {
- dout(10) << " osd." << from << " scrub reserve = success" << dendl;
- scrubber.reserved_peers.insert(from);
- sched_scrub();
- }
-}
-
-void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
- dout(7) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
- if (!scrubber.local_reserved) {
- dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
- return;
- }
- if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
- dout(10) << " already had osd." << from << " reserved" << dendl;
- } else {
- /* One decline stops this pg from being scheduled for scrubbing. */
- dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
- scrubber.reserve_failed = true;
- sched_scrub();
- }
-}
-
-void PG::handle_scrub_reserve_release(OpRequestRef op)
-{
- dout(7) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
- clear_scrub_reserved();
-}
-
// Compute pending backfill data
static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
{
void PG::unreserve_recovery_space() {
primary_num_bytes.store(0);
local_num_bytes.store(0);
- return;
-}
-
-void PG::clear_scrub_reserved()
-{
- scrubber.reserved_peers.clear();
- scrubber.reserve_failed = false;
-
- if (scrubber.local_reserved) {
- scrubber.local_reserved = false;
- osd->dec_scrubs_local();
- }
- if (scrubber.remote_reserved) {
- scrubber.remote_reserved = false;
- osd->dec_scrubs_remote();
- }
-}
-
-void PG::scrub_reserve_replicas()
-{
- ceph_assert(recovery_state.get_backfill_targets().empty());
- std::vector<std::pair<int, Message*>> messages;
- messages.reserve(get_actingset().size());
- epoch_t e = get_osdmap_epoch();
- for (set<pg_shard_t>::iterator i = get_actingset().begin();
- i != get_actingset().end();
- ++i) {
- if (*i == pg_whoami) continue;
- dout(10) << "scrub requesting reserve from osd." << *i << dendl;
- Message* m = new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e,
- MOSDScrubReserve::REQUEST, pg_whoami);
- messages.push_back(std::make_pair(i->osd, m));
- }
- if (!messages.empty()) {
- osd->send_message_osd_cluster(messages, e);
- }
-}
-
-void PG::scrub_unreserve_replicas()
-{
- ceph_assert(recovery_state.get_backfill_targets().empty());
- std::vector<std::pair<int, Message*>> messages;
- messages.reserve(get_actingset().size());
- epoch_t e = get_osdmap_epoch();
- for (set<pg_shard_t>::iterator i = get_actingset().begin();
- i != get_actingset().end();
- ++i) {
- if (*i == pg_whoami) continue;
- dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
- Message* m = new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e,
- MOSDScrubReserve::RELEASE, pg_whoami);
- messages.push_back(std::make_pair(i->osd, m));
- }
- if (!messages.empty()) {
- osd->send_message_osd_cluster(messages, e);
- }
}
void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
}
}
-void PG::_scan_snaps(ScrubMap &smap)
-{
- hobject_t head;
- SnapSet snapset;
-
- // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
- // caller using clean_meta_map(), and it works properly.
- dout(20) << __func__ << " start" << dendl;
-
- for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
- i != smap.objects.rend();
- ++i) {
- const hobject_t &hoid = i->first;
- ScrubMap::object &o = i->second;
-
- dout(20) << __func__ << " " << hoid << dendl;
-
- ceph_assert(!hoid.is_snapdir());
- if (hoid.is_head()) {
- // parse the SnapSet
- bufferlist bl;
- if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
- continue;
- }
- bl.push_back(o.attrs[SS_ATTR]);
- auto p = bl.cbegin();
- try {
- decode(snapset, p);
- } catch(...) {
- continue;
- }
- head = hoid.get_head();
- continue;
- }
- if (hoid.snap < CEPH_MAXSNAP) {
- // check and if necessary fix snap_mapper
- if (hoid.get_head() != head) {
- derr << __func__ << " no head for " << hoid << " (have " << head << ")"
- << dendl;
- continue;
- }
- set<snapid_t> obj_snaps;
- auto p = snapset.clone_snaps.find(hoid.snap);
- if (p == snapset.clone_snaps.end()) {
- derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
- << dendl;
- continue;
- }
- obj_snaps.insert(p->second.begin(), p->second.end());
- set<snapid_t> cur_snaps;
- int r = snap_mapper.get_snaps(hoid, &cur_snaps);
- if (r != 0 && r != -ENOENT) {
- derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
- ceph_abort();
- }
- if (r == -ENOENT || cur_snaps != obj_snaps) {
- ObjectStore::Transaction t;
- OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- if (r == 0) {
- r = snap_mapper.remove_oid(hoid, &_t);
- if (r != 0) {
- derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
- << dendl;
- ceph_abort();
- }
- osd->clog->error() << "osd." << osd->whoami
- << " found snap mapper error on pg "
- << info.pgid
- << " oid " << hoid << " snaps in mapper: "
- << cur_snaps << ", oi: "
- << obj_snaps
- << "...repaired";
- } else {
- osd->clog->error() << "osd." << osd->whoami
- << " found snap mapper error on pg "
- << info.pgid
- << " oid " << hoid << " snaps missing in mapper"
- << ", should be: "
- << obj_snaps
- << " was " << cur_snaps << " r " << r
- << "...repaired";
- }
- snap_mapper.add_oid(hoid, obj_snaps, &_t);
-
- // wait for repair to apply to avoid confusing other bits of the system.
- {
- ceph::condition_variable my_cond;
- ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
- int r = 0;
- bool done;
- t.register_on_applied_sync(
- new C_SafeCond(my_lock, my_cond, &done, &r));
- r = osd->store->queue_transaction(ch, std::move(t));
- if (r != 0) {
- derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
- << dendl;
- } else {
- std::unique_lock l{my_lock};
- my_cond.wait(l, [&done] { return done;});
- }
- }
- }
- }
- }
-}
void PG::_repair_oinfo_oid(ScrubMap &smap)
{
}
}
}
-int PG::build_scrub_map_chunk(
- ScrubMap &map,
- ScrubMapBuilder &pos,
- hobject_t start,
- hobject_t end,
- bool deep,
- ThreadPool::TPHandle &handle)
-{
- dout(10) << __func__ << " [" << start << "," << end << ") "
- << " pos " << pos
- << dendl;
-
- // start
- while (pos.empty()) {
- pos.deep = deep;
- map.valid_through = info.last_update;
-
- // objects
- vector<ghobject_t> rollback_obs;
- pos.ret = get_pgbackend()->objects_list_range(
- start,
- end,
- &pos.ls,
- &rollback_obs);
- if (pos.ret < 0) {
- dout(5) << "objects_list_range error: " << pos.ret << dendl;
- return pos.ret;
- }
- if (pos.ls.empty()) {
- break;
- }
- _scan_rollback_obs(rollback_obs);
- pos.pos = 0;
- return -EINPROGRESS;
- }
-
- // scan objects
- while (!pos.done()) {
- int r = get_pgbackend()->be_scan_list(map, pos);
- if (r == -EINPROGRESS) {
- return r;
- }
- }
-
- // finish
- dout(20) << __func__ << " finishing" << dendl;
- ceph_assert(pos.done());
- _repair_oinfo_oid(map);
- if (!is_primary()) {
- ScrubMap for_meta_scrub;
- // In case we restarted smaller chunk, clear old data
- scrubber.cleaned_meta_map.clear_from(scrubber.start);
- scrubber.cleaned_meta_map.insert(map);
- scrubber.clean_meta_map(for_meta_scrub);
- _scan_snaps(for_meta_scrub);
- }
-
- dout(20) << __func__ << " done, got " << map.objects.size() << " items"
- << dendl;
- return 0;
-}
-
-void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
- if (!store)
- return;
- struct OnComplete : Context {
- std::unique_ptr<Scrub::Store> store;
- explicit OnComplete(
- std::unique_ptr<Scrub::Store> &&store)
- : store(std::move(store)) {}
- void finish(int) override {}
- };
- store->cleanup(t);
- t->register_on_complete(new OnComplete(std::move(store)));
- ceph_assert(!store);
-}
void PG::repair_object(
const hobject_t &soid,
recovery_state.force_object_missing(bad_peers, soid, oi.version);
}
-/* replica_scrub
- *
- * Wait for last_update_applied to match msg->scrub_to as above. Wait
- * for pushes to complete in case of recent recovery. Build a single
- * scrubmap of objects that are in the range [msg->start, msg->end).
- */
-void PG::replica_scrub(
- OpRequestRef op,
- ThreadPool::TPHandle &handle)
+void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
{
- auto msg = op->get_req<MOSDRepScrub>();
- ceph_assert(!scrubber.active_rep_scrub);
- dout(7) << "replica_scrub" << dendl;
+ dout(10) << __func__ << " (op)" << dendl;
+ m_scrubber->replica_scrub_op(op);
+}
- if (msg->map_epoch < info.history.same_interval_since) {
- dout(10) << "replica_scrub discarding old replica_scrub from "
- << msg->map_epoch << " < " << info.history.same_interval_since
- << dendl;
- return;
- }
+void PG::scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
- ceph_assert(msg->chunky);
- if (active_pushes > 0) {
- dout(10) << "waiting for active pushes to finish" << dendl;
- scrubber.active_rep_scrub = op;
- return;
- }
+ scrub_queued = false;
- scrubber.state = Scrubber::BUILD_MAP_REPLICA;
- scrubber.replica_scrub_start = msg->min_epoch;
- scrubber.start = msg->start;
- scrubber.end = msg->end;
- scrubber.max_end = msg->end;
- scrubber.deep = msg->deep;
- scrubber.epoch_start = info.history.same_interval_since;
- if (msg->priority) {
- scrubber.priority = msg->priority;
- } else {
- scrubber.priority = get_scrub_priority();
+ if (pg_has_reset_since(queued)) {
+ dout(10) << " pg::scrub reset_since " << __func__ << " " << queued << dendl;
+ dout(10) << " pg::scrub reset_since " << __func__ << " "
+ << recovery_state.get_last_peering_reset() << dendl;
+ m_scrubber->scrub_clear_state(false);
+ return;
}
- scrub_can_preempt = msg->allow_preemption;
- scrub_preempted = false;
- scrubber.replica_scrubmap_pos.reset();
+  // a replica request would have been handled by PG::replica_scrub()
+  ceph_assert(is_primary());
- requeue_scrub(msg->high_priority);
+ ceph_assert(!m_scrubber->is_scrub_active());
+ // a new scrub
+ m_scrubber->reset_epoch(queued);
+ m_scrubber->send_start_scrub();
}
-/* Scrub:
- * PG_STATE_SCRUBBING is set when the scrub is queued
- *
- * scrub will be chunky if all OSDs in PG support chunky scrub
- * scrub will fail if OSDs are too old.
- */
-void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
-{
- OSDService *osds = osd;
- double scrub_sleep = osds->osd->scrub_sleep_time(scrubber.must_scrub);
- if (scrub_sleep > 0 &&
- (scrubber.state == PG::Scrubber::NEW_CHUNK ||
- scrubber.state == PG::Scrubber::INACTIVE) &&
- scrubber.needs_sleep) {
- ceph_assert(!scrubber.sleeping);
- dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
-
- // Do an async sleep so we don't block the op queue
- spg_t pgid = get_pgid();
- int state = scrubber.state;
- auto scrub_requeue_callback =
- new LambdaContext([osds, pgid, state](int r) {
- PGRef pg = osds->osd->lookup_lock_pg(pgid);
- if (pg == nullptr) {
- lgeneric_dout(osds->osd->cct, 20)
- << "scrub_requeue_callback: Could not find "
- << "PG " << pgid << " can't complete scrub requeue after sleep"
- << dendl;
- return;
- }
- pg->scrubber.sleeping = false;
- pg->scrubber.needs_sleep = false;
- lgeneric_dout(pg->cct, 20)
- << "scrub_requeue_callback: slept for "
- << ceph_clock_now() - pg->scrubber.sleep_start
- << ", re-queuing scrub with state " << state << dendl;
- pg->scrub_queued = false;
- pg->requeue_scrub();
- pg->scrubber.sleep_start = utime_t();
- pg->unlock();
- });
- std::lock_guard l(osd->sleep_lock);
- osd->sleep_timer.add_event_after(scrub_sleep,
- scrub_requeue_callback);
- scrubber.sleeping = true;
- scrubber.sleep_start = ceph_clock_now();
- return;
- }
- if (pg_has_reset_since(queued)) {
- return;
- }
- ceph_assert(scrub_queued);
- scrub_queued = false;
- scrubber.needs_sleep = true;
+// note: no need to secure OSD resources for a recovery scrub
+void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
+{
+ dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued << dendl;
- // for the replica
- if (!is_primary() &&
- scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
- chunky_scrub(handle);
- return;
- }
+ scrub_queued = false;
- if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
- dout(10) << "scrub -- not primary or active or not clean" << dendl;
- state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_REPAIR);
- state_clear(PG_STATE_DEEP_SCRUB);
- publish_stats_to_osd();
+ if (pg_has_reset_since(epoch_queued)) {
+ dout(10) << " reset_since " << __func__ << " " << epoch_queued << dendl;
+ dout(10) << " reset_since " << __func__ << " "
+ << recovery_state.get_last_peering_reset() << dendl;
return;
}
- if (!scrubber.active) {
- ceph_assert(recovery_state.get_backfill_targets().empty());
-
- scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
+ ceph_assert(is_primary());
+ ceph_assert(!m_scrubber->is_scrub_active());
- dout(10) << "starting a new chunky scrub" << dendl;
- }
-
- chunky_scrub(handle);
+ // a new scrub
+ m_scrubber->reset_epoch(epoch_queued);
+ m_scrubber->send_start_after_repair();
}
-void PG::abort_scrub()
+void PG::replica_scrub(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- scrub_clear_state();
- scrub_unreserve_replicas();
+ dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued
+ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+ scrub_queued = false;
+ m_scrubber->replica_scrub(epoch_queued);
}
-/*
- * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
- * chunk.
- *
- * The object store is partitioned into chunks which end on hash boundaries. For
- * each chunk, the following logic is performed:
- *
- * (1) Block writes on the chunk
- * (2) Request maps from replicas
- * (3) Wait for pushes to be applied (after recovery)
- * (4) Wait for writes to flush on the chunk
- * (5) Wait for maps from replicas
- * (6) Compare / repair all scrub maps
- * (7) Wait for digest updates to apply
- *
- * This logic is encoded in the mostly linear state machine:
- *
- * +------------------+
- * _________v__________ |
- * | | |
- * | INACTIVE | |
- * |____________________| |
- * | |
- * | +----------+ |
- * _________v___v______ | |
- * | | | |
- * | NEW_CHUNK | | |
- * |____________________| | |
- * | | |
- * _________v__________ | |
- * | | | |
- * | WAIT_PUSHES | | |
- * |____________________| | |
- * | | |
- * _________v__________ | |
- * | | | |
- * | WAIT_LAST_UPDATE | | |
- * |____________________| | |
- * | | |
- * _________v__________ | |
- * | | | |
- * | BUILD_MAP | | |
- * |____________________| | |
- * | | |
- * _________v__________ | |
- * | | | |
- * | WAIT_REPLICAS | | |
- * |____________________| | |
- * | | |
- * _________v__________ | |
- * | | | |
- * | COMPARE_MAPS | | |
- * |____________________| | |
- * | | |
- * | | |
- * _________v__________ | |
- * | | | |
- * |WAIT_DIGEST_UPDATES | | |
- * |____________________| | |
- * | | | |
- * | +----------+ |
- * _________v__________ |
- * | | |
- * | FINISH | |
- * |____________________| |
- * | |
- * +------------------+
- *
- * The primary determines the last update from the subset by walking the log. If
- * it sees a log entry pertaining to a file in the chunk, it tells the replicas
- * to wait until that update is applied before building a scrub map. Both the
- * primary and replicas will wait for any active pushes to be applied.
- *
- * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
- *
- * scrubber.state encodes the current state of the scrub (refer to state diagram
- * for details).
- */
-void PG::chunky_scrub(ThreadPool::TPHandle &handle)
-{
- // Since repair is only by request and we need to scrub afterward
- // treat the same as req_scrub.
- if (!scrubber.req_scrub) {
- if (state_test(PG_STATE_DEEP_SCRUB)) {
- if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
- pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
- dout(10) << "nodeep_scrub set, aborting" << dendl;
- abort_scrub();
- return;
- }
- } else if (state_test(PG_STATE_SCRUBBING)) {
- if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
- dout(10) << "noscrub set, aborting" << dendl;
- abort_scrub();
- return;
- }
- }
- }
- // check for map changes
- if (scrubber.is_chunky_scrub_active()) {
- if (scrubber.epoch_start != info.history.same_interval_since) {
- dout(10) << "scrub pg changed, aborting" << dendl;
- abort_scrub();
- return;
- }
- }
-
- bool done = false;
- int ret;
-
- while (!done) {
- dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
- << " [" << scrubber.start << "," << scrubber.end << ")"
- << " max_end " << scrubber.max_end << dendl;
-
- switch (scrubber.state) {
- case PG::Scrubber::INACTIVE:
- dout(10) << "scrub start" << dendl;
- ceph_assert(is_primary());
-
- publish_stats_to_osd();
- scrubber.epoch_start = info.history.same_interval_since;
- scrubber.active = true;
-
- {
- ObjectStore::Transaction t;
- scrubber.cleanup_store(&t);
- scrubber.store.reset(Scrub::Store::create(osd->store, &t,
- info.pgid, coll));
- osd->store->queue_transaction(ch, std::move(t), nullptr);
- }
-
- // Don't include temporary objects when scrubbing
- scrubber.start = info.pgid.pgid.get_hobj_start();
- scrubber.state = PG::Scrubber::NEW_CHUNK;
-
- {
- bool repair = state_test(PG_STATE_REPAIR);
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
- stringstream oss;
- oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
- osd->clog->debug(oss);
- }
-
- scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
- "osd_scrub_max_preemptions");
- scrubber.preempt_divisor = 1;
- break;
-
- case PG::Scrubber::NEW_CHUNK:
- scrubber.primary_scrubmap = ScrubMap();
- scrubber.received_maps.clear();
-
- // begin (possible) preemption window
- if (scrub_preempted) {
- scrubber.preempt_left--;
- scrubber.preempt_divisor *= 2;
- dout(10) << __func__ << " preempted, " << scrubber.preempt_left
- << " left" << dendl;
- scrub_preempted = false;
- }
- scrub_can_preempt = scrubber.preempt_left > 0;
-
- {
- /* get the start and end of our scrub chunk
- *
- * Our scrub chunk has an important restriction we're going to need to
- * respect. We can't let head be start or end.
- * Using a half-open interval means that if end == head,
- * we'd scrub/lock head and the clone right next to head in different
- * chunks which would allow us to miss clones created between
- * scrubbing that chunk and scrubbing the chunk including head.
- * This isn't true for any of the other clones since clones can
- * only be created "just to the left of" head. There is one exception
- * to this: promotion of clones which always happens to the left of the
- * left-most clone, but promote_object checks the scrubber in that
- * case, so it should be ok. Also, it's ok to "miss" clones at the
- * left end of the range if we are a tier because they may legitimately
- * not exist (see _scrub).
- */
- ceph_assert(scrubber.preempt_divisor > 0);
- int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
- scrubber.preempt_divisor);
- int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
- scrubber.preempt_divisor);
- hobject_t start = scrubber.start;
- hobject_t candidate_end;
- vector<hobject_t> objects;
- ret = get_pgbackend()->objects_list_partial(
- start,
- min,
- max,
- &objects,
- &candidate_end);
- ceph_assert(ret >= 0);
-
- if (!objects.empty()) {
- hobject_t back = objects.back();
- while (candidate_end.is_head() &&
- candidate_end == back.get_head()) {
- candidate_end = back;
- objects.pop_back();
- if (objects.empty()) {
- ceph_assert(0 ==
- "Somehow we got more than 2 objects which"
- "have the same head but are not clones");
- }
- back = objects.back();
- }
- if (candidate_end.is_head()) {
- ceph_assert(candidate_end != back.get_head());
- candidate_end = candidate_end.get_object_boundary();
- }
- } else {
- ceph_assert(candidate_end.is_max());
- }
-
- if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
- // we'll be requeued by whatever made us unavailable for scrub
- dout(10) << __func__ << ": scrub blocked somewhere in range "
- << "[" << scrubber.start << ", " << candidate_end << ")"
- << dendl;
- done = true;
- break;
- }
- scrubber.end = candidate_end;
- if (scrubber.end > scrubber.max_end)
- scrubber.max_end = scrubber.end;
- }
-
- // walk the log to find the latest update that affects our chunk
- scrubber.subset_last_update = eversion_t();
- for (auto p = projected_log.log.rbegin();
- p != projected_log.log.rend();
- ++p) {
- if (p->soid >= scrubber.start &&
- p->soid < scrubber.end) {
- scrubber.subset_last_update = p->version;
- break;
- }
- }
- if (scrubber.subset_last_update == eversion_t()) {
- for (list<pg_log_entry_t>::const_reverse_iterator p =
- recovery_state.get_pg_log().get_log().log.rbegin();
- p != recovery_state.get_pg_log().get_log().log.rend();
- ++p) {
- if (p->soid >= scrubber.start &&
- p->soid < scrubber.end) {
- scrubber.subset_last_update = p->version;
- break;
- }
- }
- }
-
- scrubber.state = PG::Scrubber::WAIT_PUSHES;
- break;
-
- case PG::Scrubber::WAIT_PUSHES:
- if (active_pushes == 0) {
- scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
- } else {
- dout(15) << "wait for pushes to apply" << dendl;
- done = true;
- }
- break;
-
- case PG::Scrubber::WAIT_LAST_UPDATE:
- if (recovery_state.get_last_update_applied() <
- scrubber.subset_last_update) {
- // will be requeued by op_applied
- dout(15) << "wait for EC read/modify/writes to queue" << dendl;
- done = true;
- break;
- }
-
- // ask replicas to scan
- scrubber.waiting_on_whom.insert(pg_whoami);
-
- // request maps from replicas
- for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
- i != get_acting_recovery_backfill().end();
- ++i) {
- if (*i == pg_whoami) continue;
- _request_scrub_map(*i, scrubber.subset_last_update,
- scrubber.start, scrubber.end, scrubber.deep,
- scrubber.preempt_left > 0);
- scrubber.waiting_on_whom.insert(*i);
- }
- dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
- << dendl;
-
- scrubber.state = PG::Scrubber::BUILD_MAP;
- scrubber.primary_scrubmap_pos.reset();
- break;
-
- case PG::Scrubber::BUILD_MAP:
- ceph_assert(recovery_state.get_last_update_applied() >=
- scrubber.subset_last_update);
-
- // build my own scrub map
- if (scrub_preempted) {
- dout(10) << __func__ << " preempted" << dendl;
- scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
- break;
- }
- ret = build_scrub_map_chunk(
- scrubber.primary_scrubmap,
- scrubber.primary_scrubmap_pos,
- scrubber.start, scrubber.end,
- scrubber.deep,
- handle);
- if (ret == -EINPROGRESS) {
- requeue_scrub();
- done = true;
- break;
- }
- scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
- break;
-
- case PG::Scrubber::BUILD_MAP_DONE:
- if (scrubber.primary_scrubmap_pos.ret < 0) {
- dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
- << ", aborting" << dendl;
- scrub_clear_state();
- scrub_unreserve_replicas();
- return;
- }
- dout(10) << __func__ << " waiting_on_whom was "
- << scrubber.waiting_on_whom << dendl;
- ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
- scrubber.waiting_on_whom.erase(pg_whoami);
-
- scrubber.state = PG::Scrubber::WAIT_REPLICAS;
- break;
-
- case PG::Scrubber::WAIT_REPLICAS:
- if (!scrubber.waiting_on_whom.empty()) {
- // will be requeued by do_replica_scrub_map
- dout(10) << "wait for replicas to build scrub map" << dendl;
- done = true;
- break;
- }
- // end (possible) preemption window
- scrub_can_preempt = false;
- if (scrub_preempted) {
- dout(10) << __func__ << " preempted, restarting chunk" << dendl;
- scrubber.state = PG::Scrubber::NEW_CHUNK;
- } else {
- scrubber.state = PG::Scrubber::COMPARE_MAPS;
- }
- break;
-
- case PG::Scrubber::COMPARE_MAPS:
- ceph_assert(recovery_state.get_last_update_applied() >=
- scrubber.subset_last_update);
- ceph_assert(scrubber.waiting_on_whom.empty());
-
- scrub_compare_maps();
- scrubber.start = scrubber.end;
- scrubber.run_callbacks();
-
- // requeue the writes from the chunk that just finished
- requeue_ops(waiting_for_scrub);
-
- scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
-
- // fall-thru
-
- case PG::Scrubber::WAIT_DIGEST_UPDATES:
- if (scrubber.num_digest_updates_pending) {
- dout(10) << __func__ << " waiting on "
- << scrubber.num_digest_updates_pending
- << " digest updates" << dendl;
- done = true;
- break;
- }
-
- scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
- "osd_scrub_max_preemptions");
- scrubber.preempt_divisor = 1;
-
- if (!(scrubber.end.is_max())) {
- scrubber.state = PG::Scrubber::NEW_CHUNK;
- requeue_scrub();
- done = true;
- } else {
- scrubber.state = PG::Scrubber::FINISH;
- }
-
- break;
-
- case PG::Scrubber::FINISH:
- scrub_finish();
- scrubber.state = PG::Scrubber::INACTIVE;
- done = true;
-
- if (!snap_trimq.empty()) {
- dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
- snap_trimmer_scrub_complete();
- }
-
- break;
-
- case PG::Scrubber::BUILD_MAP_REPLICA:
- // build my own scrub map
- if (scrub_preempted) {
- dout(10) << __func__ << " preempted" << dendl;
- ret = 0;
- } else {
- ret = build_scrub_map_chunk(
- scrubber.replica_scrubmap,
- scrubber.replica_scrubmap_pos,
- scrubber.start, scrubber.end,
- scrubber.deep,
- handle);
- }
- if (ret == -EINPROGRESS) {
- requeue_scrub();
- done = true;
- break;
- }
- // reply
- {
- MOSDRepScrubMap *reply = new MOSDRepScrubMap(
- spg_t(info.pgid.pgid, get_primary().shard),
- scrubber.replica_scrub_start,
- pg_whoami);
- reply->preempted = scrub_preempted;
- ::encode(scrubber.replica_scrubmap, reply->get_data());
- osd->send_message_osd_cluster(
- get_primary().osd, reply,
- scrubber.replica_scrub_start);
- }
- scrub_preempted = false;
- scrub_can_preempt = false;
- scrubber.state = PG::Scrubber::INACTIVE;
- scrubber.replica_scrubmap = ScrubMap();
- scrubber.replica_scrubmap_pos = ScrubMapBuilder();
- scrubber.start = hobject_t();
- scrubber.end = hobject_t();
- scrubber.max_end = hobject_t();
- done = true;
- break;
-
- default:
- ceph_abort();
- }
- }
- dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
- << " [" << scrubber.start << "," << scrubber.end << ")"
- << " max_end " << scrubber.max_end << dendl;
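+// The PG::scrub_send_*() handlers below are the dequeue side of the scrub
+// event queue: each one runs when its queued scrub item is executed, and
+// forwards the corresponding event to the scrubber (m_scrubber).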
+void PG::scrub_send_scrub_resched(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+ scrub_queued = false;
+ m_scrubber->send_scrub_resched();
}
-bool PG::write_blocked_by_scrub(const hobject_t& soid)
+void PG::scrub_send_resources_granted(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- if (soid < scrubber.start || soid >= scrubber.end) {
- return false;
- }
- if (scrub_can_preempt) {
- if (!scrub_preempted) {
- dout(10) << __func__ << " " << soid << " preempted" << dendl;
- scrub_preempted = true;
- } else {
- dout(10) << __func__ << " " << soid << " already preempted" << dendl;
- }
- return false;
- }
- return true;
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ m_scrubber->send_remotes_reserved();
}
-bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
+void PG::scrub_send_resources_denied(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- // does [start, end] intersect [scrubber.start, scrubber.max_end)
- return (start < scrubber.max_end &&
- end >= scrubber.start);
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ m_scrubber->send_reservation_failure();
}
-void PG::scrub_clear_state(bool has_error)
+void PG::replica_scrub_resched(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- ceph_assert(is_locked());
- state_clear(PG_STATE_SCRUBBING);
- if (!has_error)
- state_clear(PG_STATE_REPAIR);
- state_clear(PG_STATE_DEEP_SCRUB);
- publish_stats_to_osd();
-
- scrubber.req_scrub = false;
- // local -> nothing.
- if (scrubber.local_reserved) {
- osd->dec_scrubs_local();
- scrubber.local_reserved = false;
- scrubber.reserved_peers.clear();
- }
-
- requeue_ops(waiting_for_scrub);
-
- scrubber.reset();
-
- // type-specific state clear
- _scrub_clear_state();
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ scrub_queued = false;
+ m_scrubber->replica_scrub_resched(epoch_queued);
}
-void PG::scrub_compare_maps()
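+// Dequeued scrub events carry the epoch at which they were queued. If the PG
+// has gone through a peering reset since then, the event is stale and is
+// dropped.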
+void PG::scrub_send_pushes_update(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- dout(10) << __func__ << " has maps, analyzing" << dendl;
-
- // construct authoritative scrub map for type specific scrubbing
- scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
- map<hobject_t,
- pair<std::optional<uint32_t>,
- std::optional<uint32_t>>> missing_digest;
-
- map<pg_shard_t, ScrubMap *> maps;
- maps[pg_whoami] = &scrubber.primary_scrubmap;
-
- for (const auto& i : get_acting_recovery_backfill()) {
- if (i == pg_whoami) continue;
- dout(2) << __func__ << " replica " << i << " has "
- << scrubber.received_maps[i].objects.size()
- << " items" << dendl;
- maps[i] = &scrubber.received_maps[i];
- }
-
- set<hobject_t> master_set;
-
- // Construct master set
- for (const auto& map : maps) {
- for (const auto& i : map.second->objects) {
- master_set.insert(i.first);
- }
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ if (pg_has_reset_since(epoch_queued)) {
+ dout(10) << __func__ << " been reset at "
+ << recovery_state.get_last_peering_reset() << dendl;
+ return;
}
+ m_scrubber->active_pushes_notification();
+}
- stringstream ss;
- get_pgbackend()->be_omap_checks(maps, master_set,
- scrubber.omap_stats, ss);
+void PG::scrub_send_replica_pushes(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ m_scrubber->send_replica_pushes_upd();
+}
- if (!ss.str().empty()) {
- osd->clog->warn(ss);
+void PG::scrub_send_applied_update(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ if (pg_has_reset_since(epoch_queued)) {
+ dout(10) << __func__ << " been reset at "
+ << recovery_state.get_last_peering_reset() << dendl;
+ return;
}
+ m_scrubber->update_applied_notification(epoch_queued);
+}
- if (recovery_state.get_acting().size() > 1) {
- dout(10) << __func__ << " comparing replica scrub maps" << dendl;
-
- // Map from object with errors to good peer
- map<hobject_t, list<pg_shard_t>> authoritative;
-
- dout(2) << __func__ << get_primary() << " has "
- << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
-
- ss.str("");
- ss.clear();
-
- get_pgbackend()->be_compare_scrubmaps(
- maps,
- master_set,
- state_test(PG_STATE_REPAIR),
- scrubber.missing,
- scrubber.inconsistent,
- authoritative,
- missing_digest,
- scrubber.shallow_errors,
- scrubber.deep_errors,
- scrubber.store.get(),
- info.pgid, recovery_state.get_acting(),
- ss);
- dout(2) << ss.str() << dendl;
-
- if (!ss.str().empty()) {
- osd->clog->error(ss);
- }
-
- for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
- i != authoritative.end();
- ++i) {
- list<pair<ScrubMap::object, pg_shard_t> > good_peers;
- for (list<pg_shard_t>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- good_peers.emplace_back(maps[*j]->objects[i->first], *j);
- }
- scrubber.authoritative.emplace(i->first, good_peers);
- }
-
- for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
- i != authoritative.end();
- ++i) {
- scrubber.cleaned_meta_map.objects.erase(i->first);
- scrubber.cleaned_meta_map.objects.insert(
- *(maps[i->second.back()]->objects.find(i->first))
- );
- }
+void PG::scrub_send_unblocking(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ if (pg_has_reset_since(epoch_queued)) {
+ dout(10) << __func__ << " been reset at "
+ << recovery_state.get_last_peering_reset() << dendl;
+ return;
}
+ m_scrubber->send_scrub_unblock();
+}
- ScrubMap for_meta_scrub;
- scrubber.clean_meta_map(for_meta_scrub);
-
- // ok, do the pg-type specific scrubbing
- scrub_snapshot_metadata(for_meta_scrub, missing_digest);
- // Called here on the primary can use an authoritative map if it isn't the primary
- _scan_snaps(for_meta_scrub);
- if (!scrubber.store->empty()) {
- if (state_test(PG_STATE_REPAIR)) {
- dout(10) << __func__ << ": discarding scrub results" << dendl;
- scrubber.store->flush(nullptr);
- } else {
- dout(10) << __func__ << ": updating scrub object" << dendl;
- ObjectStore::Transaction t;
- scrubber.store->flush(&t);
- osd->store->queue_transaction(ch, std::move(t), nullptr);
- }
- }
+void PG::scrub_send_digest_update(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ m_scrubber->digest_update_notification();
}
-bool PG::scrub_process_inconsistent()
+void PG::scrub_send_replmaps_ready(epoch_t epoch_queued,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
{
- dout(10) << __func__ << ": checking authoritative" << dendl;
- bool repair = state_test(PG_STATE_REPAIR);
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-
- // authoriative only store objects which missing or inconsistent.
- if (!scrubber.authoritative.empty()) {
- stringstream ss;
- ss << info.pgid << " " << mode << " "
- << scrubber.missing.size() << " missing, "
- << scrubber.inconsistent.size() << " inconsistent objects";
- dout(2) << ss.str() << dendl;
- osd->clog->error(ss);
- if (repair) {
- state_clear(PG_STATE_CLEAN);
- for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
- scrubber.authoritative.begin();
- i != scrubber.authoritative.end();
- ++i) {
- auto missing_entry = scrubber.missing.find(i->first);
- if (missing_entry != scrubber.missing.end()) {
- repair_object(
- i->first,
- i->second,
- missing_entry->second);
- scrubber.fixed += missing_entry->second.size();
- }
- if (scrubber.inconsistent.count(i->first)) {
- repair_object(
- i->first,
- i->second,
- scrubber.inconsistent[i->first]);
- scrubber.fixed += missing_entry->second.size();
- }
- }
- }
- }
- return (!scrubber.authoritative.empty() && repair);
+ dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+ m_scrubber->send_replica_maps_ready();
}
-bool PG::ops_blocked_by_scrub() const {
+bool PG::ops_blocked_by_scrub() const
+{
return (waiting_for_scrub.size() != 0);
}
-// the part that actually finalizes a scrub
-void PG::scrub_finish()
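+// Translate "are client ops waiting on this scrub?" into the priority used
+// when requeueing scrub events: blocked I/O means the next event should be
+// queued as high priority.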
+Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const
{
- dout(20) << __func__ << dendl;
- bool repair = state_test(PG_STATE_REPAIR);
- bool do_auto_scrub = false;
- // if the repair request comes from auto-repair and large number of errors,
- // we would like to cancel auto-repair
- if (repair && scrubber.auto_repair
- && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
- state_clear(PG_STATE_REPAIR);
- repair = false;
- }
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-
- // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
- if (scrubber.deep_scrub_on_error
- && scrubber.authoritative.size()
- && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
- ceph_assert(!deep_scrub);
- do_auto_scrub = true;
- dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
- }
- scrubber.deep_scrub_on_error = false;
-
- // type-specific finish (can tally more errors)
- _scrub_finish();
-
- bool has_error = scrub_process_inconsistent();
-
- {
- stringstream oss;
- oss << info.pgid.pgid << " " << mode << " ";
- int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
- if (total_errors)
- oss << total_errors << " errors";
- else
- oss << "ok";
- if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
- oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
- << " remaining deep scrub error details lost)";
- if (repair)
- oss << ", " << scrubber.fixed << " fixed";
- if (total_errors)
- osd->clog->error(oss);
- else
- osd->clog->debug(oss);
- }
-
- // Since we don't know which errors were fixed, we can only clear them
- // when every one has been fixed.
- if (repair) {
- if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
- ceph_assert(deep_scrub);
- scrubber.shallow_errors = scrubber.deep_errors = 0;
- dout(20) << __func__ << " All may be fixed" << dendl;
- } else if (has_error) {
- // Deep scrub in order to get corrected error counts
- scrub_after_recovery = true;
- save_req_scrub = scrubber.req_scrub;
- dout(20) << __func__ << " Set scrub_after_recovery, req_scrub=" << save_req_scrub << dendl;
- } else if (scrubber.shallow_errors || scrubber.deep_errors) {
- // We have errors but nothing can be fixed, so there is no repair
- // possible.
- state_set(PG_STATE_FAILED_REPAIR);
- dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
- << " error(s) present with no repair possible" << dendl;
- }
- }
-
- {
- // finish up
- ObjectStore::Transaction t;
- recovery_state.update_stats(
- [this, deep_scrub](auto &history, auto &stats) {
- utime_t now = ceph_clock_now();
- history.last_scrub = recovery_state.get_info().last_update;
- history.last_scrub_stamp = now;
- if (scrubber.deep) {
- history.last_deep_scrub = recovery_state.get_info().last_update;
- history.last_deep_scrub_stamp = now;
- }
-
- if (deep_scrub) {
- if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
- history.last_clean_scrub_stamp = now;
- stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
- stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
- stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
- stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
- stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
- dout(25) << "scrub_finish shard " << pg_whoami << " num_omap_bytes = "
- << stats.stats.sum.num_omap_bytes << " num_omap_keys = "
- << stats.stats.sum.num_omap_keys << dendl;
- } else {
- stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
- // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
- // because of deep-scrub errors
- if (scrubber.shallow_errors == 0)
- history.last_clean_scrub_stamp = now;
- }
- stats.stats.sum.num_scrub_errors =
- stats.stats.sum.num_shallow_scrub_errors +
- stats.stats.sum.num_deep_scrub_errors;
- if (scrubber.check_repair) {
- scrubber.check_repair = false;
- if (info.stats.stats.sum.num_scrub_errors) {
- state_set(PG_STATE_FAILED_REPAIR);
- dout(10) << "scrub_finish " << info.stats.stats.sum.num_scrub_errors
- << " error(s) still present after re-scrub" << dendl;
- }
- }
- return true;
- },
- &t);
- int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
- ceph_assert(tr == 0);
- }
-
- if (has_error) {
- queue_peering_event(
- PGPeeringEventRef(
- std::make_shared<PGPeeringEvent>(
- get_osdmap_epoch(),
- get_osdmap_epoch(),
- PeeringState::DoRecovery())));
- }
-
- scrub_clear_state(has_error);
- scrub_unreserve_replicas();
-
- if (do_auto_scrub) {
- scrub_requested(false, false, true);
- }
-
- if (is_active() && is_primary()) {
- recovery_state.share_pg_info();
- }
+ return waiting_for_scrub.size() ? Scrub::scrub_prio_t::high_priority
+ : Scrub::scrub_prio_t::low_priority;
}
bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
{
- if (get_last_peering_reset() > reply_epoch ||
- get_last_peering_reset() > query_epoch) {
- dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
- << " last_peering_reset " << get_last_peering_reset()
- << dendl;
+ if (auto last_reset = get_last_peering_reset();
+ last_reset > reply_epoch || last_reset > query_epoch) {
+ dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch "
+ << query_epoch << " last_peering_reset " << last_reset << dendl;
return true;
}
return false;
ostream& operator<<(ostream& out, const PG& pg)
{
out << pg.recovery_state;
- if (pg.scrubber.must_repair)
- out << " MUST_REPAIR";
- if (pg.scrubber.auto_repair)
- out << " AUTO_REPAIR";
- if (pg.scrubber.check_repair)
- out << " CHECK_REPAIR";
- if (pg.scrubber.deep_scrub_on_error)
- out << " DEEP_SCRUB_ON_ERROR";
- if (pg.scrubber.must_deep_scrub)
- out << " MUST_DEEP_SCRUB";
- if (pg.scrubber.must_scrub)
- out << " MUST_SCRUB";
- if (pg.scrubber.time_for_deep)
- out << " TIME_FOR_DEEP";
- if (pg.scrubber.need_auto)
- out << " NEED_AUTO";
- if (pg.scrubber.req_scrub)
- out << " REQ_SCRUB";
+
+ // listing all scrub-related flags - both current and "planned next scrub"
+ if (pg.is_scrubbing()) {
+ out << *pg.m_scrubber;
+ }
+ out << pg.m_planned_scrub;
if (pg.recovery_ops_active)
out << " rops=" << pg.recovery_ops_active;
// resets the messenger session when the replica reconnects. To avoid
// out-of-order replies, the messages from that replica should be discarded.
OSDMapRef next_map = osd->get_next_osdmap();
- if (next_map->is_down(from))
+ if (next_map->is_down(from)) {
+ dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl;
return true;
+ }
/* Mostly, this overlaps with the old_peering_msg
* condition. An important exception is pushes
* sent by replicas not in the acting set, since
* if such a replica goes down it does not cause
* a new interval. */
- if (next_map->get_down_at(from) >= m->map_epoch)
+ if (next_map->get_down_at(from) >= m->map_epoch) {
+ dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl;
return true;
+ }
// same pg?
// if pg changes _at all_, we reset and repeer!
recovery_state.handle_event(evt, &rctx);
}
-void PG::Scrubber::dump(Formatter *f)
-{
- f->open_object_section("scrubber");
- f->dump_stream("epoch_start") << epoch_start;
- f->dump_bool("active", active);
- if (active) {
- f->dump_string("state", state_string(state));
- f->dump_stream("start") << start;
- f->dump_stream("end") << end;
- f->dump_stream("max_end") << max_end;
- f->dump_stream("subset_last_update") << subset_last_update;
- f->dump_bool("deep", deep);
- f->dump_bool("must_scrub", must_scrub);
- f->dump_bool("must_deep_scrub", must_deep_scrub);
- f->dump_bool("must_repair", must_repair);
- f->dump_bool("need_auto", need_auto);
- f->dump_bool("req_scrub", req_scrub);
- f->dump_bool("time_for_deep", time_for_deep);
- f->dump_bool("auto_repair", auto_repair);
- f->dump_bool("check_repair", check_repair);
- f->dump_bool("deep_scrub_on_error", deep_scrub_on_error);
- f->dump_stream("scrub_reg_stamp") << scrub_reg_stamp; //utime_t
- f->dump_stream("waiting_on_whom") << waiting_on_whom; //set<pg_shard_t>
- f->dump_unsigned("priority", priority);
- f->dump_int("shallow_errors", shallow_errors);
- f->dump_int("deep_errors", deep_errors);
- f->dump_int("fixed", fixed);
- {
- f->open_array_section("waiting_on_whom");
- for (set<pg_shard_t>::iterator p = waiting_on_whom.begin();
- p != waiting_on_whom.end();
- ++p) {
- f->dump_stream("shard") << *p;
- }
- f->close_section();
- }
- }
- f->close_section();
-}
void PG::handle_query_state(Formatter *f)
{
  // This code has moved to after the close of the recovery_state array.
  // I don't think that scrub is a recovery state.
- if (is_primary() && is_active()) {
- f->open_object_section("scrub");
- f->dump_stream("scrubber.epoch_start") << scrubber.epoch_start;
- f->dump_bool("scrubber.active", scrubber.active);
- f->dump_string("scrubber.state", PG::Scrubber::state_string(scrubber.state));
- f->dump_stream("scrubber.start") << scrubber.start;
- f->dump_stream("scrubber.end") << scrubber.end;
- f->dump_stream("scrubber.max_end") << scrubber.max_end;
- f->dump_stream("scrubber.subset_last_update") << scrubber.subset_last_update;
- f->dump_bool("scrubber.deep", scrubber.deep);
- {
- f->open_array_section("scrubber.waiting_on_whom");
- for (set<pg_shard_t>::iterator p = scrubber.waiting_on_whom.begin();
- p != scrubber.waiting_on_whom.end();
- ++p) {
- f->dump_stream("shard") << *p;
- }
- f->close_section();
- }
- f->dump_string("comment", "DEPRECATED - may be removed in the next release");
- f->close_section();
+ if (is_primary() && is_active() && m_scrubber->is_scrub_active()) {
+ m_scrubber->handle_query_state(f);
}
}
public:
const pg_shard_t pg_whoami;
const spg_t pg_id;
+
+ std::unique_ptr<ScrubPgIF> m_scrubber;
+
/// flags detailing scheduling/operation characteristics of the next scrub
requested_scrub_t m_planned_scrub;
+ /// scrubbing state for both Primary & replicas
+ bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }
public:
// -- members --
ObjectStore::Transaction &t);
void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+ /**
+ * a special version of PG::scrub(), which:
+ * - is initiated after repair, and
+ * - is not required to allocate local/remote OSD scrub resources
+ */
+ void recovery_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+ void replica_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+ void replica_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle);
/// Queues a PGScrubResourcesOK message. Will translate into 'RemotesReserved' FSM event
void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle &handle);
void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle &handle);
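+  /// the scrub_send_*() handlers below follow a common pattern: each services
+  /// one kind of queued scrub event by feeding the matching FSM event to the
+  /// scrubber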
+ void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle);
+ void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle &handle);
+ void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle &handle);
+ void scrub_send_unblocking(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+ void scrub_send_digest_update(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+ void scrub_send_replmaps_ready(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+ void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle &handle);
- bool is_scrub_registered();
void reg_next_scrub();
- void unreg_next_scrub();
void queue_want_pg_temp(const std::vector<int> &wanted) override;
void clear_want_pg_temp() override;
void on_info_history_change() override;
- void scrub_requested(bool deep, bool repair, bool need_auto = false) override;
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override;
uint64_t get_snap_trimq_size() const override {
return snap_trimq.size();
return finish_recovery();
}
- void on_activate(interval_set<snapid_t> snaps) override {
- ceph_assert(scrubber.callbacks.empty());
- ceph_assert(callbacks_for_degraded_object.empty());
- snap_trimq = snaps;
- release_pg_backoffs();
- projected_last_update = info.last_update;
- }
+ void on_activate(interval_set<snapid_t> snaps) override;
void on_activate_committed() override;
void shutdown();
virtual void on_shutdown() = 0;
- bool get_must_scrub() const {
- return scrubber.must_scrub;
- }
+ bool get_must_scrub() const;
bool sched_scrub();
unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const;
/// the version that refers to flags_.priority
unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const;
+private:
+ // auxiliaries used by sched_scrub():
+ double next_deepscrub_interval() const;
+
+ /// should we perform deep scrub?
+ bool is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const;
+
+ /**
+ * Verify the various 'next scrub' flags in m_planned_scrub against configuration
+ * and scrub-related timestamps.
+ *
+   * @returns an updated copy of the m_planned_scrub flags (or std::nullopt if no scrub should take place)
+ */
+ std::optional<requested_scrub_t> verify_scrub_mode() const;
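+  // A sketch of the intended call pattern (an assumption here, given that the
+  // comment above names sched_scrub() as the user of these auxiliaries):
+  //   if (auto updated_flags = verify_scrub_mode(); updated_flags) {
+  //     m_planned_scrub = *updated_flags;
+  //     // ...go on to queue the scrub...
+  //   }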
+
+ bool verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const;
+
+public:
virtual void do_request(
OpRequestRef& op,
ThreadPool::TPHandle &handle
pg->get_pgbackend()->trim(entry, t);
}
};
-
+
void update_object_snap_mapping(
ObjectStore::Transaction *t, const hobject_t &soid,
const std::set<snapid_t> &snaps);
hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
release_backoffs(begin, end);
}
-protected:
// -- scrub --
-public:
- struct Scrubber {
- Scrubber();
- ~Scrubber();
-
- // metadata
- std::set<pg_shard_t> reserved_peers;
- bool local_reserved, remote_reserved, reserve_failed;
- epoch_t epoch_start;
-
- // common to both scrubs
- bool active;
- std::set<pg_shard_t> waiting_on_whom;
- int shallow_errors;
- int deep_errors;
- int fixed;
- ScrubMap primary_scrubmap;
- ScrubMapBuilder primary_scrubmap_pos;
- epoch_t replica_scrub_start = 0;
- ScrubMap replica_scrubmap;
- ScrubMapBuilder replica_scrubmap_pos;
- std::map<pg_shard_t, ScrubMap> received_maps;
- OpRequestRef active_rep_scrub;
- utime_t scrub_reg_stamp; // stamp we registered for
-
- static utime_t scrub_must_stamp() { return utime_t(0,1); }
-
- omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
-
- // For async sleep
- bool sleeping = false;
- bool needs_sleep = true;
- utime_t sleep_start;
-
- // flags to indicate explicitly requested scrubs (by admin)
- bool must_scrub, must_deep_scrub, must_repair, need_auto, req_scrub;
-
- // Priority to use for scrub scheduling
- unsigned priority = 0;
-
- bool time_for_deep;
- // this flag indicates whether we would like to do auto-repair of the PG or not
- bool auto_repair;
- // this flag indicates that we are scrubbing post repair to verify everything is fixed
- bool check_repair;
- // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
- // we should deep scrub in order to auto repair
- bool deep_scrub_on_error;
-
- // Maps from objects with errors to missing/inconsistent peers
- std::map<hobject_t, std::set<pg_shard_t>> missing;
- std::map<hobject_t, std::set<pg_shard_t>> inconsistent;
-
- // Std::map from object with errors to good peers
- std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t> >> authoritative;
-
- // Cleaned std::map pending snap metadata scrub
- ScrubMap cleaned_meta_map;
-
- void clean_meta_map(ScrubMap &for_meta_scrub) {
- if (end.is_max() ||
- cleaned_meta_map.objects.empty()) {
- cleaned_meta_map.swap(for_meta_scrub);
- } else {
- auto iter = cleaned_meta_map.objects.end();
- --iter; // not empty, see if clause
- auto begin = cleaned_meta_map.objects.begin();
- if (iter->first.has_snapset()) {
- ++iter;
- } else {
- while (iter != begin) {
- auto next = iter--;
- if (next->first.get_head() != iter->first.get_head()) {
- ++iter;
- break;
- }
- }
- }
- for_meta_scrub.objects.insert(begin, iter);
- cleaned_meta_map.objects.erase(begin, iter);
- }
- }
-
- // digest updates which we are waiting on
- int num_digest_updates_pending;
-
- // chunky scrub
- hobject_t start, end; // [start,end)
- hobject_t max_end; // Largest end that may have been sent to replicas
- eversion_t subset_last_update;
-
- // chunky scrub state
- enum State {
- INACTIVE,
- NEW_CHUNK,
- WAIT_PUSHES,
- WAIT_LAST_UPDATE,
- BUILD_MAP,
- BUILD_MAP_DONE,
- WAIT_REPLICAS,
- COMPARE_MAPS,
- WAIT_DIGEST_UPDATES,
- FINISH,
- BUILD_MAP_REPLICA,
- } state;
-
- std::unique_ptr<Scrub::Store> store;
- // deep scrub
- bool deep;
- int preempt_left;
- int preempt_divisor;
-
- std::list<Context*> callbacks;
- void add_callback(Context *context) {
- callbacks.push_back(context);
- }
- void run_callbacks() {
- std::list<Context*> to_run;
- to_run.swap(callbacks);
- for (std::list<Context*>::iterator i = to_run.begin();
- i != to_run.end();
- ++i) {
- (*i)->complete(0);
- }
- }
-
- static const char *state_string(const PG::Scrubber::State& state) {
- const char *ret = NULL;
- switch( state )
- {
- case INACTIVE: ret = "INACTIVE"; break;
- case NEW_CHUNK: ret = "NEW_CHUNK"; break;
- case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
- case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
- case BUILD_MAP: ret = "BUILD_MAP"; break;
- case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
- case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
- case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
- case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
- case FINISH: ret = "FINISH"; break;
- case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
- }
- return ret;
- }
-
- bool is_chunky_scrub_active() const { return state != INACTIVE; }
-
- // clear all state
- void reset() {
- active = false;
- waiting_on_whom.clear();
- if (active_rep_scrub) {
- active_rep_scrub = OpRequestRef();
- }
- received_maps.clear();
-
- must_scrub = false;
- must_deep_scrub = false;
- must_repair = false;
- need_auto = false;
- req_scrub = false;
- time_for_deep = false;
- auto_repair = false;
- check_repair = false;
- deep_scrub_on_error = false;
-
- state = PG::Scrubber::INACTIVE;
- start = hobject_t();
- end = hobject_t();
- max_end = hobject_t();
- subset_last_update = eversion_t();
- shallow_errors = 0;
- deep_errors = 0;
- fixed = 0;
- omap_stats = (const struct omap_stat_t){ 0 };
- deep = false;
- run_callbacks();
- inconsistent.clear();
- missing.clear();
- authoritative.clear();
- num_digest_updates_pending = 0;
- primary_scrubmap = ScrubMap();
- primary_scrubmap_pos.reset();
- replica_scrubmap = ScrubMap();
- replica_scrubmap_pos.reset();
- cleaned_meta_map = ScrubMap();
- sleeping = false;
- needs_sleep = true;
- sleep_start = utime_t();
- }
-
- void create_results(const hobject_t& obj);
- void cleanup_store(ObjectStore::Transaction *t);
- void dump(ceph::Formatter *f);
- } scrubber;
-
protected:
bool scrub_after_recovery;
- bool save_req_scrub; // Saved for scrub_after_recovery
int active_pushes;
- bool scrub_can_preempt = false;
- bool scrub_preempted = false;
-
- // we allow some number of preemptions of the scrub, which mean we do
- // not block. then we start to block. once we start blocking, we do
- // not stop until the scrub range is completed.
- bool write_blocked_by_scrub(const hobject_t &soid);
-
- /// true if the given range intersects the scrub interval in any way
- bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
-
void repair_object(
const hobject_t &soid,
const std::list<std::pair<ScrubMap::object, pg_shard_t> > &ok_peers,
const std::set<pg_shard_t> &bad_peers);
- void abort_scrub();
- void chunky_scrub(ThreadPool::TPHandle &handle);
- void scrub_compare_maps();
- /**
- * return true if any inconsistency/missing is repaired, false otherwise
- */
- bool scrub_process_inconsistent();
- bool ops_blocked_by_scrub() const;
- void scrub_finish();
- void scrub_clear_state(bool keep_repair = false);
- void _scan_snaps(ScrubMap &map);
+ [[nodiscard]] bool ops_blocked_by_scrub() const;
+ [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const;
+
void _repair_oinfo_oid(ScrubMap &map);
void _scan_rollback_obs(const std::vector<ghobject_t> &rollback_obs);
- void _request_scrub_map(pg_shard_t replica, eversion_t version,
- hobject_t start, hobject_t end, bool deep,
- bool allow_preemption);
- int build_scrub_map_chunk(
- ScrubMap &map,
- ScrubMapBuilder &pos,
- hobject_t start, hobject_t end, bool deep,
- ThreadPool::TPHandle &handle);
/**
* returns true if [begin, end) is good to scrub at this time
* a false return value obliges the implementer to requeue scrub when the
*/
virtual bool _range_available_for_scrub(
const hobject_t &begin, const hobject_t &end) = 0;
- virtual void scrub_snapshot_metadata(
- ScrubMap &map,
- const std::map<hobject_t,
- std::pair<std::optional<uint32_t>,
- std::optional<uint32_t>>> &missing_digest) { }
- virtual void _scrub_clear_state() { }
- virtual void _scrub_finish() { }
- void clear_scrub_reserved();
- void scrub_reserve_replicas();
- void scrub_unreserve_replicas();
- bool scrub_all_replicas_reserved() const;
-
- void replica_scrub(
- OpRequestRef op,
- ThreadPool::TPHandle &handle);
- void do_replica_scrub_map(OpRequestRef op);
-
- void handle_scrub_reserve_request(OpRequestRef op);
- void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
- void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
- void handle_scrub_reserve_release(OpRequestRef op);
+
+ /**
+ * Initiate the process that will create our scrub map for the Primary.
+ * (triggered by MSG_OSD_REP_SCRUB)
+ */
+ void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle);
// -- recovery state --
bool is_clean() const { return recovery_state.is_clean(); }
bool is_degraded() const { return recovery_state.is_degraded(); }
bool is_undersized() const { return recovery_state.is_undersized(); }
- bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
+ bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only
bool is_remapped() const { return recovery_state.is_remapped(); }
bool is_peered() const { return recovery_state.is_peered(); }
bool is_recovering() const { return recovery_state.is_recovering(); }
virtual void kick_snap_trim() = 0;
virtual void snap_trimmer_scrub_complete() = 0;
- bool requeue_scrub(bool high_priority = false);
+
void queue_recovery();
- bool queue_scrub();
- unsigned get_scrub_priority();
+ void queue_scrub_after_repair();
+ unsigned int get_scrub_priority();
bool try_flush_or_schedule_async() override;
void start_flush_on_transaction(
/// Notify that info/history changed (generally to update scrub registration)
virtual void on_info_history_change() = 0;
/// Notify that a scrub has been requested
- virtual void scrub_requested(bool deep, bool repair, bool need_auto = false) = 0;
+ virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0;
/// Return current snap_trimq size
virtual uint64_t get_snap_trimq_size() const = 0;
};
struct RequestScrub : boost::statechart::event<RequestScrub> {
- bool deep;
- bool repair;
- explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
+ scrub_level_t deep;
+ scrub_type_t repair;
+ explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {}
void print(std::ostream *out) const {
- *out << "RequestScrub(" << (deep ? "deep" : "shallow")
- << (repair ? " repair" : "");
+ *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow")
+ << ((repair==scrub_type_t::do_repair) ? " repair)" : ")");
}
};
#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
+#include "pg_scrubber.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
if (r != 0) {
derr << "Error opening class '" << class_name << "': "
<< cpp_strerror(r) << dendl;
- if (r != -EPERM) // propogate permission error
+ if (r != -EPERM) // propagate permission error
r = -EINVAL;
return { r, nullptr };
} else {
f->close_section();
if (is_primary() && is_active()) {
- scrubber.dump(f.get());
+ m_scrubber->dump(f.get());
}
f->open_object_section("agent_state");
dout(10) << " corrupted scrub_ls_arg_t" << dendl;
return -EINVAL;
}
+
int r = 0;
scrub_ls_result_t result = {.interval = info.history.same_interval_since};
+
if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
r = -EAGAIN;
- } else if (!scrubber.store) {
- r = -ENOENT;
- } else if (arg.get_snapsets) {
- result.vals = scrubber.store->get_snap_errors(get_pgid().pool(),
- arg.start_after,
- arg.max_return);
} else {
- result.vals = scrubber.store->get_object_errors(get_pgid().pool(),
- arg.start_after,
- arg.max_return);
+ bool store_queried = m_scrubber->get_store_errors(arg, result);
+ if (!store_queried) {
+ // the scrubber's store is not initialized
+ r = -ENOENT;
+ }
}
- encode(result, osd_op->outdata);
+  encode(result, osd_op->outdata);  // TODO: should the result be encoded even when no store exists?
+
return r;
}
+}
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool,
new_backfill(false),
temp_seq(0),
snap_trimmer_machine(this)
-{
+{
recovery_state.set_backend_predicates(
pgbackend->get_is_readable_predicate(),
pgbackend->get_is_recoverable_predicate());
snap_trimmer_machine.initiate();
+
+ m_scrubber = make_unique<PgScrubber>(this); // *not* the final code
+ // next commit: m_scrubber = make_unique<PrimaryLogScrub>(this);
}
void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
auto m = op->get_req<MOSDScrubReserve>();
switch (m->type) {
case MOSDScrubReserve::REQUEST:
- handle_scrub_reserve_request(op);
+ m_scrubber->handle_scrub_reserve_request(op);
break;
case MOSDScrubReserve::GRANT:
- handle_scrub_reserve_grant(op, m->from);
+ m_scrubber->handle_scrub_reserve_grant(op, m->from);
break;
case MOSDScrubReserve::REJECT:
- handle_scrub_reserve_reject(op, m->from);
+ m_scrubber->handle_scrub_reserve_reject(op, m->from);
break;
case MOSDScrubReserve::RELEASE:
- handle_scrub_reserve_release(op);
+ m_scrubber->handle_scrub_reserve_release(op);
break;
}
}
return;
}
- if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
+ if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_scrub.push_back(op);
op->mark_delayed("waiting for scrub");
return cache_result_t::BLOCKED_RECOVERY;
}
- if (write_blocked_by_scrub(head)) {
+ if (m_scrubber->write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_scrub.push_back(op);
op->mark_delayed("waiting for scrub");
{
hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
ceph_assert(hoid != hobject_t());
- if (write_blocked_by_scrub(hoid)) {
+ if (m_scrubber->write_blocked_by_scrub(hoid)) {
dout(10) << __func__ << " " << hoid
<< " blocked by scrub" << dendl;
if (op) {
}
}
- if (is_primary() && scrubber.active) {
- if (soid < scrubber.start) {
- dout(20) << __func__ << " " << soid << " < [" << scrubber.start
- << "," << scrubber.end << ")" << dendl;
- scrub_cstat.add(delta_stats);
- } else {
- dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
- << "," << scrubber.end << ")" << dendl;
- }
- }
+ m_scrubber->stats_of_handled_objects(delta_stats, soid);
}
void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
}
if (!fop->blocking &&
- write_blocked_by_scrub(oid)) {
+ m_scrubber->write_blocked_by_scrub(oid)) {
if (fop->op) {
dout(10) << __func__ << " blocked by scrub" << dendl;
requeue_op(fop->op);
ceph_assert(applied_version != eversion_t());
ceph_assert(applied_version <= info.last_update);
recovery_state.local_write_applied(applied_version);
- if (is_primary()) {
- if (scrubber.active) {
- if (recovery_state.get_last_update_applied() >=
- scrubber.subset_last_update) {
- requeue_scrub(ops_blocked_by_scrub());
- }
- } else {
- ceph_assert(scrubber.start == scrubber.end);
- }
+
+  if (is_primary() &&
+      m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) {
+ osd->queue_scrub_applied_update(this, is_scrub_blocking_ops());
}
}
return;
}
- if (write_blocked_by_scrub(obc->obs.oi.soid)) {
+ if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
dout(10) << "handle_watch_timeout waiting for scrub on obj "
<< obc->obs.oi.soid
<< dendl;
- scrubber.add_callback(
+ m_scrubber->add_callback(
watch->get_delayed_cb() // This callback!
);
return;
}
if (obc->requeue_scrub_on_unblock) {
+
obc->requeue_scrub_on_unblock = false;
+
+ dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
+
// only requeue if we are still active: we may be unblocking
// because we are resetting for a new peering interval
if (is_active()) {
- requeue_scrub();
+ osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
}
}
}
--active_pushes;
// requeue an active chunky scrub waiting on recovery ops
- if (!recovery_state.is_deleting() && active_pushes == 0
- && scrubber.is_chunky_scrub_active()) {
- requeue_scrub(ops_blocked_by_scrub());
+ if (!recovery_state.is_deleting() && active_pushes == 0 &&
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
}
}
ceph_assert(active_pushes >= 1);
--active_pushes;
- // requeue an active chunky scrub waiting on recovery ops
+ // requeue an active scrub waiting on recovery ops
if (!recovery_state.is_deleting() && active_pushes == 0 &&
- scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
- scrubber.active_rep_scrub->get_req())->chunky) {
- auto& op = scrubber.active_rep_scrub;
- osd->enqueue_back(
- OpSchedulerItem(
- unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
- op->get_req()->get_cost(),
- op->get_req()->get_priority(),
- op->get_req()->get_recv_stamp(),
- op->get_req()->get_source().num(),
- get_osdmap_epoch()));
- scrubber.active_rep_scrub.reset();
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
}
}
osd->clear_queued_recovery(this);
}
- clear_scrub_reserved();
- scrub_clear_state();
+ m_scrubber->scrub_clear_state();
- unreg_next_scrub();
+ m_scrubber->unreg_next_scrub();
vector<ceph_tid_t> tids;
cancel_copy_ops(false, &tids);
requeue_ops(waiting_for_active);
requeue_ops(waiting_for_readable);
- clear_scrub_reserved();
+ m_scrubber->clear_scrub_reservations();
vector<ceph_tid_t> tids;
cancel_copy_ops(is_primary(), &tids);
}
// requeues waiting_for_scrub
- scrub_clear_state();
+ m_scrubber->scrub_clear_state();
for (auto p = waiting_for_blocked_object.begin();
p != waiting_for_blocked_object.end();
context_registry_on_change();
pgbackend->on_change_cleanup(&t);
- scrubber.cleanup_store(&t);
+ m_scrubber->cleanup_store(&t);
pgbackend->on_change();
// clear snap_trimmer state
#ifdef DEBUG_RECOVERY_OIDS
recovering_oids.clear();
#endif
+ dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
+
last_backfill_started = hobject_t();
set<hobject_t>::iterator i = backfills_in_flight.begin();
while (i != backfills_in_flight.end()) {
// Once we hit a degraded object just skip
if (is_degraded_or_backfilling_object(aoid))
return;
- if (write_blocked_by_scrub(aoid))
+ if (m_scrubber->write_blocked_by_scrub(aoid))
return;
}
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
return;
- if (write_blocked_by_scrub(aoid))
+ if (m_scrubber->write_blocked_by_scrub(aoid))
return;
}
new_hset.using_gmt);
// If the current object is degraded we skip this persist request
- if (write_blocked_by_scrub(oid))
+ if (m_scrubber->write_blocked_by_scrub(oid))
return;
hit_set->seal();
osd->logger->inc(l_osd_agent_skip);
continue;
}
-    if (range_intersects_scrub(obc->obs.oi.soid,
-			       obc->obs.oi.soid.get_head())) {
+    if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
+					   obc->obs.oi.soid.get_head())) {
dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
osd->logger->inc(l_osd_agent_skip);
return false;
}
// This is already checked by agent_work() which passes after_flush = false
- if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
+ if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
return false;
}
// ==========================================================================================
// SCRUB
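+// A scrub map has arrived from a replica (MOSDRepScrubMap). If the scrub is
+// no longer active the map is discarded; otherwise it is handed over to the
+// scrubber via map_from_replica().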
+void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
+{
+ dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
+ op->mark_started();
+
+ if (!m_scrubber->is_scrub_active()) {
+ dout(10) << __func__ << " scrub isn't active" << dendl;
+ return;
+ }
+ m_scrubber->map_from_replica(op);
+}
-bool PrimaryLogPG::_range_available_for_scrub(
- const hobject_t &begin, const hobject_t &end)
+bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
+ const hobject_t& end)
{
pair<hobject_t, ObjectContextRef> next;
next.second = object_contexts.lookup(begin);
ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
return discard_event();
}
- if (pg->scrubber.active) {
+ if (pg->m_scrubber->is_scrub_active()) {
ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
return transit< WaitScrub >();
} else {
return osd->check_failsafe_full(get_dpp());
}
+bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
+{
+ return m_scrubber->write_blocked_by_scrub(oid);
+}
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
OstreamTemp clog_error() override { return osd->clog->error(); }
OstreamTemp clog_warn() override { return osd->clog->warn(); }
+ /**
+ * a scrub-map arrived from a replica
+ */
+ void do_replica_scrub_map(OpRequestRef op);
+
struct watch_disconnect_t {
uint64_t cookie;
entity_name_t name;
* Releases locks
*
* @param manager [in] manager with locks to release
+ *
+ * (moved to .cc due to scrubber access)
*/
- void release_object_locks(
- ObcLockManager &lock_manager) {
- std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
- bool requeue_recovery = false;
- bool requeue_snaptrim = false;
- lock_manager.put_locks(
- &to_req,
- &requeue_recovery,
- &requeue_snaptrim);
- if (requeue_recovery)
- queue_recovery();
- if (requeue_snaptrim)
- snap_trimmer_machine.process_event(TrimWriteUnblocked());
-
- if (!to_req.empty()) {
- // requeue at front of scrub blocking queue if we are blocked by scrub
- for (auto &&p: to_req) {
- if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
- for (auto& op : p.second) {
- op->mark_delayed("waiting for scrub");
- }
-
- waiting_for_scrub.splice(
- waiting_for_scrub.begin(),
- p.second,
- p.second.begin(),
- p.second.end());
- } else if (is_laggy()) {
- for (auto& op : p.second) {
- op->mark_delayed("waiting for readable");
- }
- waiting_for_readable.splice(
- waiting_for_readable.begin(),
- p.second,
- p.second.begin(),
- p.second.end());
- } else {
- requeue_ops(p.second);
- }
- }
- }
- }
+ void release_object_locks(ObcLockManager &lock_manager);
// replica ops
// [primary|tail]
void on_removal(ObjectStore::Transaction &t) override;
void on_shutdown() override;
bool check_failsafe_full() override;
- bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
- return write_blocked_by_scrub(oid);
- }
+ bool maybe_preempt_replica_scrub(const hobject_t& oid) override;
int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
// attr cache handling
return t->gen_prefix(*_dout) << " scrubber pg(" << t->pg_id << ") ";
}
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+ if (sf.auto_repair)
+ out << " AUTO_REPAIR";
+ if (sf.check_repair)
+ out << " CHECK_REPAIR";
+ if (sf.deep_scrub_on_error)
+ out << " DEEP_SCRUB_ON_ERROR";
+ if (sf.required)
+ out << " REQ_SCRUB";
+
+ return out;
+}
+
ostream& operator<<(ostream& out, const requested_scrub_t& sf)
{
if (sf.must_repair)
return out;
}
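+
+// A queued scrub event is only still relevant if we are (still) the primary,
+// the PG is active and clean, a scrub is in progress, and neither the map
+// epoch nor the PG interval has changed since the event was queued.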
+bool PgScrubber::is_event_relevant(epoch_t queued) const
+{
+ return is_primary() && m_pg->is_active() && m_pg->is_clean() && is_scrub_active() &&
+ !was_epoch_changed() && (!queued || !m_pg->pg_has_reset_since(queued));
+}
+
+bool PgScrubber::should_abort_scrub(epoch_t queued) const
+{
+ dout(10) << __func__ << "(): queued:" << queued << " required: " << m_flags.required
+ << " noscrub: " << get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) << " / "
+ << m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB) << dendl;
+
+ if (!is_primary() || !m_pg->is_active() ||
+ (queued && m_pg->pg_has_reset_since(queued))) {
+ return true;
+ }
+
+ if (m_flags.required) {
+ return false; // not stopping 'required' scrubs for configuration changes
+ }
+
+ if (state_test(PG_STATE_DEEP_SCRUB)) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
+ dout(10) << "nodeep_scrub set, aborting" << dendl;
+ return true;
+ }
+ } else if (state_test(PG_STATE_SCRUBBING)) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+ dout(10) << "noscrub set, aborting" << dendl;
+ return true;
+ }
+ }
+
+ return false;
+}
+
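+// Each of the scrubber event entry points below logs its entry and exit,
+// re-checks (where relevant) that the scrub should not be aborted, and then
+// feeds the corresponding boost::statechart event into the scrubber FSM.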
+void PgScrubber::send_start_scrub()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ if (should_abort_scrub(epoch_t(0))) {
+ dout(10) << __func__ << " aborting!" << dendl;
+ scrub_clear_state(false);
+ } else {
+ m_fsm->my_states();
+ m_fsm->process_event(StartScrub{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_after_repair()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ m_fsm->process_event(AfterRepairScrub{});
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_unblock()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ if (should_abort_scrub(epoch_t(0))) {
+
+ dout(10) << __func__ << " aborting!" << dendl;
+ scrub_clear_state(false);
+
+ } else if (is_scrub_active()) {
+
+ m_fsm->my_states();
+ m_fsm->process_event(Unblocked{});
+
+ } else {
+ dout(10) << __func__ << " ignored as scrub not active" << dendl;
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_resched()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ if (should_abort_scrub(epoch_t(0))) {
+ dout(10) << __func__ << " aborting!" << dendl;
+ scrub_clear_state(false);
+ } else if (is_scrub_active()) {
+ m_fsm->my_states();
+ m_fsm->process_event(InternalSchedScrub{});
+ } else {
+ // no need to send anything
+ dout(10) << __func__ << " event no longer relevant" << dendl;
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_replica()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ m_fsm->process_event(StartReplica{});
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_sched_replica()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ m_fsm->process_event(SchedReplica{}); // retest for map availability
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::active_pushes_notification()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ if (should_abort_scrub(epoch_t(0))) {
+ dout(10) << __func__ << " aborting!" << dendl;
+ scrub_clear_state(false);
+ } else {
+ m_fsm->my_states();
+ m_fsm->process_event(ActivePushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << "() epoch: " << epoch_queued << dendl;
+ if (should_abort_scrub(epoch_queued)) {
+ dout(10) << __func__ << " aborting!" << dendl;
+ scrub_clear_state(false);
+ } else {
+ m_fsm->my_states();
+ m_fsm->process_event(UpdatesApplied{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::digest_update_notification()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ if (is_event_relevant(epoch_t(0))) {
+ m_fsm->process_event(DigestUpdate{});
+ } else {
+ // no need to send anything
+ dout(10) << __func__ << " event no longer relevant" << dendl;
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_epoch_changed()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ if (is_scrub_active()) {
+ m_fsm->my_states();
+ m_fsm->process_event(EpochChanged{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_maps_ready()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ if (is_scrub_active()) {
+ m_fsm->process_event(GotReplicas{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_pushes_upd()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ if (is_scrub_active()) {
+ m_fsm->process_event(ReplicaPushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_remotes_reserved()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ m_fsm->process_event(RemotesReserved{}); // note: too early to check for 'active'!
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_reservation_failure()
+{
+ dout(10) << "scrubber event -->> " << __func__ << dendl;
+ m_fsm->my_states();
+ m_fsm->process_event(ReservationFailure{}); // do not check for 'active'!
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+bool PgScrubber::is_scrub_active() const
+{
+ dout(10) << " " << __func__ << " actv? " << m_active << "pg:" << m_pg->pg_id << dendl;
+ return m_active;
+}
+
+bool PgScrubber::is_reserving() const
+{
+ return m_fsm->is_reserving();
+}
+
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+ dout(10) << __func__ << " PG( " << m_pg->pg_id
+ << (m_pg->is_primary() ? ") prm" : ") rpl") << " epoch: " << epoch_queued
+ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+
+ dout(10) << __func__ << " STATE_SCRUBBING? " << state_test(PG_STATE_SCRUBBING) << dendl;
+ m_epoch_queued = epoch_queued;
+ m_needs_sleep = true;
+
+ m_fsm->assert_not_active();
+
+ m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+ unsigned int qu_priority = m_flags.priority;
+
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ qu_priority =
+ std::max(qu_priority, (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+ }
+ return qu_priority;
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const
+{
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ suggested_priority = std::max(suggested_priority,
+ (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+ }
+ return suggested_priority;
+}
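+
+// example (illustration only, assuming the default osd_client_op_priority of 63):
+// with m_flags.priority == 5, a high_priority requeue yields max(5, 63) == 63,
+// while a low_priority requeue keeps the priority at 5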
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub op registration handling
+
+bool PgScrubber::is_scrub_registered() const
+{
+ return !m_scrub_reg_stamp.is_zero();
+}
+
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+ if (!is_primary()) {
+ dout(20) << __func__ << ": not a primary!" << dendl;
+ return;
+ }
+
+ dout(10) << __func__ << " planned.m.s: " << request_flags.must_scrub
+ << ": planned.n.a.: " << request_flags.need_auto
+ << " stamp: " << m_pg->info.history.last_scrub_stamp << dendl;
+
+ ceph_assert(!is_scrub_registered());
+
+ utime_t reg_stamp;
+ bool must = false;
+
+ if (request_flags.must_scrub || request_flags.need_auto) {
+ // Set the smallest time that isn't utime_t()
+ reg_stamp = PgScrubber::scrub_must_stamp();
+ must = true;
+ } else if (m_pg->info.stats.stats_invalid &&
+ m_pg->cct->_conf->osd_scrub_invalid_stats) {
+ reg_stamp = ceph_clock_now();
+ must = true;
+ } else {
+ reg_stamp = m_pg->info.history.last_scrub_stamp;
+ }
+
+ dout(9) << __func__ << " pg(" << m_pg_id << ") must: " << must
+ << " required:" << m_flags.required << " flags: " << request_flags
+ << " stamp: " << reg_stamp << dendl;
+
+ // note down the sched_time, so we can locate this scrub, and remove it
+ // later on.
+ double scrub_min_interval = 0;
+ double scrub_max_interval = 0;
+ m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
+ m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
+
+ m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
+ scrub_max_interval, must);
+ dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+ << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
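+
+// scheduling-stamp sketch: a 'must' scrub registers with utime_t(1, 1) (see
+// scrub_must_stamp()), sorting ahead of any real clock value; invalid-stats
+// scrubs register with 'now'; all others keep the PG's last_scrub_stamp and
+// thus their regular place in the schedule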
+
+void PgScrubber::unreg_next_scrub()
+{
+ if (is_scrub_registered()) {
+ m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+ m_scrub_reg_stamp = utime_t{};
+ }
+}
+
+/// debug/development temporary code:
+void PgScrubber::debug_dump_reservations(std::string_view header_txt) const
+{
+ std::string format;
+ auto f = Formatter::create(format, "json-pretty", "json-pretty");
+ m_osds->dump_scrub_reservations(f);
+ std::stringstream o;
+ f->flush(o);
+ dout(20) << header_txt << o.str() << dendl;
+ delete f;
+}
+
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
+ << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
+ << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+ << dendl;
+
+ debug_dump_reservations(" before_unreg ");
+
+ unreg_next_scrub();
+
+ req_flags.must_scrub = true;
+ req_flags.must_deep_scrub =
+ (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
+ req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
+ // User might intervene, so clear this
+ req_flags.need_auto = false;
+ req_flags.req_scrub = true;
+
+ dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+ debug_dump_reservations(" before_reg ");
+
+ reg_next_scrub(req_flags);
+
+ debug_dump_reservations(" after_reg ");
+}
+
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## "
+ << is_scrub_registered() << dendl;
+ debug_dump_reservations(" auto-scrub before ");
+
+ unreg_next_scrub();
+ req_flags.need_auto = true;
+ reg_next_scrub(req_flags);
+
+ debug_dump_reservations(" auto-scrub after ");
+}
+
+bool PgScrubber::reserve_local()
+{
+ // try to create the reservation object (which translates into asking the
+ // OSD for the local scrub resource). If that fails - undo it immediately
+
+ m_local_osd_resource.emplace(m_pg, m_osds);
+ if (!m_local_osd_resource->is_reserved()) {
+ m_local_osd_resource.reset();
+ return false;
+ }
+
+ return true;
+}
+
+// ----------------------------------------------------------------------------
+
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+ auto last_applied = m_pg->recovery_state.get_last_update_applied();
+ dout(10) << __func__ << " recovery last: " << last_applied
+ << " vs. scrub's: " << m_subset_last_update << dendl;
+
+ return last_applied >= m_subset_last_update;
+}
+
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+ m_subset_last_update = e;
+}
+
+/*
+ * setting:
+ * - m_subset_last_update
+ * - m_max_end
+ * - end
+ * - start
+ * By:
+ * - setting tentative range based on conf and divisor
+ * - requesting a partial list of elements from the backend;
+ * - handling some head/clones issues
+ * - ...
+ *
+ * The selected range is set directly into 'm_start' and 'm_end'
+ */
+bool PgScrubber::select_range()
+{
+ m_primary_scrubmap = ScrubMap{};
+ m_received_maps.clear();
+
+ /* get the start and end of our scrub chunk
+ *
+ * Our scrub chunk has an important restriction we're going to need to
+ * respect. We can't let head be start or end.
+ * Using a half-open interval means that if end == head,
+ * we'd scrub/lock head and the clone right next to head in different
+ * chunks which would allow us to miss clones created between
+ * scrubbing that chunk and scrubbing the chunk including head.
+ * This isn't true for any of the other clones since clones can
+ * only be created "just to the left of" head. There is one exception
+ * to this: promotion of clones which always happens to the left of the
+ * left-most clone, but promote_object checks the scrubber in that
+ * case, so it should be ok. Also, it's ok to "miss" clones at the
+ * left end of the range if we are a tier because they may legitimately
+ * not exist (see _scrub).
+ */
+ int min_idx = std::max<int64_t>(
+ 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+ int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+ preemption_data.chunk_divisor());
+
+ // why mixing 'int' and int64_t? RRR
+
+ dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+ << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+ hobject_t start = m_start;
+ hobject_t candidate_end;
+ std::vector<hobject_t> objects;
+ int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+ &candidate_end);
+ ceph_assert(ret >= 0);
+
+ if (!objects.empty()) {
+
+ hobject_t back = objects.back();
+ while (candidate_end.is_head() && candidate_end == back.get_head()) {
+ candidate_end = back;
+ objects.pop_back();
+ if (objects.empty()) {
+ ceph_assert(0 ==
+ "Somehow we got more than 2 objects which"
+ "have the same head but are not clones");
+ }
+ back = objects.back();
+ }
+
+ if (candidate_end.is_head()) {
+ ceph_assert(candidate_end != back.get_head());
+ candidate_end = candidate_end.get_object_boundary();
+ }
+
+ } else {
+ ceph_assert(candidate_end.is_max());
+ }
+
+ // is that range free for us? if not - we will be rescheduled later by whoever
+ // triggered us this time
+
+ if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+ // we'll be requeued by whatever made us unavailable for scrub
+ dout(10) << __func__ << ": scrub blocked somewhere in range "
+ << "[" << m_start << ", " << candidate_end << ")" << dendl;
+ return false;
+ }
+
+ m_end = candidate_end;
+ if (m_end > m_max_end)
+ m_max_end = m_end;
+
+ dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+ << m_max_end << dendl;
+ return true;
+}
+
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+ if (soid < m_start || soid >= m_end) {
+ return false;
+ }
+
+ dout(10) << __func__ << " " << soid << " can preempt? "
+ << preemption_data.is_preemptable() << dendl;
+ dout(10) << __func__ << " " << soid << " already? " << preemption_data.was_preempted()
+ << dendl;
+
+ if (preemption_data.is_preemptable()) {
+
+ if (!preemption_data.was_preempted()) {
+ dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+ // signal the preemption
+ preemption_data.do_preempt();
+
+ } else {
+ dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+ }
+ return false;
+ }
+ return true;
+}
+
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+ // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
+ return (start < m_max_end && end >= m_start);
+}
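+
+// intersection sketch (plain integers for illustration): with a scrubbed extent
+// of [m_start, m_max_end) = [10, 20), the closed op range [start, end] = [20, 25]
+// does not intersect (start < m_max_end fails), while [19, 25] and [5, 10] do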
+
+/**
+ * if we are required to sleep:
+ * arrange a callback some time later,
+ * making sure we can identify a stale callback;
+ * otherwise - requeue right away (i.e. reschedule through the OSD queue).
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+ milliseconds sleep_time{0ms};
+ if (m_needs_sleep) {
+ double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
+ dout(10) << __func__ << " sleep: " << scrub_sleep << dendl;
+ sleep_time = milliseconds{long(scrub_sleep)};
+ }
+ dout(15) << __func__ << " sleep: " << sleep_time.count() << " needed? " << m_needs_sleep
+ << dendl;
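+ // e.g. (a sketch): scrub_sleep_time() returning 0.1s makes scrub_sleep == 100.0
+ // and sleep_time == 100ms; add_event_after() below is then handed
+ // 100 / 1000.0 == 0.1 seconds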
+
+ if (sleep_time.count()) {
+ // schedule a transition for some 'sleep_time' ms in the future
+
+ m_needs_sleep = false;
+ m_sleep_started_at = ceph_clock_now();
+
+ // the 'delayer' for crimson is different. Will be factored out.
+
+ spg_t pgid = m_pg->get_pgid();
+ auto callbk = new LambdaContext([osds = m_osds, pgid,
+ scrbr = this]([[maybe_unused]] int r) mutable {
+ PGRef pg = osds->osd->lookup_lock_pg(pgid);
+ if (!pg) {
+ lgeneric_subdout(g_ceph_context, osd, 10)
+ << "scrub_requeue_callback: Could not find "
+ << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+ return;
+ }
+ scrbr->m_needs_sleep = true;
+ lgeneric_dout(scrbr->get_pg_cct(), 7)
+ << "scrub_requeue_callback: slept for "
+ << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+ scrbr->m_sleep_started_at = utime_t{};
+ osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
+ pg->unlock();
+ });
+
+ std::lock_guard l(m_osds->sleep_lock);
+ m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
+
+ } else {
+ // just a requeue
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+ }
+}
+
+/**
+ * walk the log to find the latest update that affects our chunk
+ */
+eversion_t PgScrubber::search_log_for_updates() const
+{
+ auto& projected = m_pg->projected_log.log;
+ auto pi = find_if(
+ projected.crbegin(), projected.crend(),
+ [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
+
+ if (pi != projected.crend())
+ return pi->version;
+
+ // there was no relevant update entry in the log
+
+ auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+ auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
+ return e.soid >= m_start && e.soid < m_end;
+ });
+
+ if (p == log.crend())
+ return eversion_t{};
+ else
+ return p->version;
+}
+
+bool PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+ dout(10) << __func__ << " epoch_start: " << m_epoch_start
+ << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+ << dendl;
+
+ bool do_have_replicas = false;
+
+ m_primary_scrubmap_pos.reset();
+
+ // ask replicas to scan and send maps
+ for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+
+ if (i == m_pg_whoami)
+ continue;
+
+ do_have_replicas = true;
+ m_maps_status.mark_replica_map_request(i);
+ _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
+ replica_can_preempt);
+ }
+
+ dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+ return do_have_replicas;
+}
+
+bool PgScrubber::was_epoch_changed() const
+{
+ // for crimson we have m_pg->get_info().history.same_interval_since
+ dout(10) << __func__ << " epoch_start: " << m_epoch_start
+ << " from pg: " << m_pg->get_history().same_interval_since << dendl;
+
+ return m_epoch_start < m_pg->get_history().same_interval_since;
+}
+
+void PgScrubber::mark_local_map_ready()
+{
+ m_maps_status.mark_local_map_ready();
+}
+
+bool PgScrubber::are_all_maps_available() const
+{
+ return m_maps_status.are_all_maps_available();
+}
+
+std::string PgScrubber::dump_awaited_maps() const
+{
+ return m_maps_status.dump();
+}
+
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption)
+{
+ ceph_assert(replica != m_pg_whoami);
+ dout(10) << __func__ << " scrubmap from osd." << replica
+ << (deep ? " deep" : " shallow") << dendl;
+
+ auto repscrubop = new MOSDRepScrub(
+ spg_t(m_pg->info.pgid.pgid, replica.shard), version, m_pg->get_osdmap_epoch(),
+ m_pg->get_last_peering_reset(), start, end, deep, allow_preemption, m_flags.priority,
+ m_pg->ops_blocked_by_scrub());
+
+ // default priority. We want the replica-scrub processed prior to any recovery
+ // or client io messages (we are holding a lock!)
+ m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
+}
+
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+ if (!m_store)
+ return;
+
+ struct OnComplete : Context {
+ std::unique_ptr<Scrub::Store> store;
+ explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
+ {}
+ void finish(int) override {}
+ };
+ m_store->cleanup(t);
+ t->register_on_complete(new OnComplete(std::move(m_store)));
+ ceph_assert(!m_store);
+}
+
+void PgScrubber::on_init()
+{
+ // going upwards from 'inactive'
+ ceph_assert(!is_scrub_active());
+
+ preemption_data.reset();
+ m_pg->publish_stats_to_osd();
+ m_epoch_start = m_pg->get_history().same_interval_since;
+
+ dout(10) << __func__ << " start same_interval:" << m_epoch_start << dendl;
+
+ // create a new store
+ {
+ ObjectStore::Transaction t;
+ cleanup_store(&t);
+ m_store.reset(
+ Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+
+ m_start = m_pg->info.pgid.pgid.get_hobj_start();
+ m_active = true;
+}
+
+void PgScrubber::on_replica_init()
+{
+ ceph_assert(!m_active);
+ m_active = true;
+}
+
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+ hobject_t head;
+ SnapSet snapset;
+
+ // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
+ // that the caller has used clean_meta_map(), and that it works properly.
+ dout(15) << __func__ << " starts" << dendl;
+
+ for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+ const hobject_t& hoid = i->first;
+ ScrubMap::object& o = i->second;
+
+ dout(20) << __func__ << " " << hoid << dendl;
+
+ ceph_assert(!hoid.is_snapdir());
+ if (hoid.is_head()) {
+ // parse the SnapSet
+ bufferlist bl;
+ if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[SS_ATTR]);
+ auto p = bl.cbegin();
+ try {
+ decode(snapset, p);
+ } catch (...) {
+ continue;
+ }
+ head = hoid.get_head();
+ continue;
+ }
+
+ if (hoid.snap < CEPH_MAXSNAP) {
+ // check and if necessary fix snap_mapper
+ if (hoid.get_head() != head) {
+ derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+ continue;
+ }
+ set<snapid_t> obj_snaps;
+ auto p = snapset.clone_snaps.find(hoid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+ continue;
+ }
+ obj_snaps.insert(p->second.begin(), p->second.end());
+ set<snapid_t> cur_snaps;
+ int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+ if (r != 0 && r != -ENOENT) {
+ derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ if (r == -ENOENT || cur_snaps != obj_snaps) {
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+ if (r == 0) {
+ r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+ if (r != 0) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+ << ", oi: " << obj_snaps << "...repaired";
+ } else {
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+ << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+ << "...repaired";
+ }
+ m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+ // wait for repair to apply to avoid confusing other bits of the system.
+ {
+ dout(15) << __func__ << " wait on repair!" << dendl;
+
+ ceph::condition_variable my_cond;
+ ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+ int e = 0;
+ bool done;
+
+ t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+ e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+ if (e != 0) {
+ derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+ } else {
+ std::unique_lock l{my_lock};
+ my_cond.wait(l, [&done] { return done; });
+ }
+ }
+ }
+ }
+ }
+}
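+
+// note on the repair above (a sketch of the flow): remove_oid() (when a stale
+// entry exists) and add_oid() are queued in a single transaction, and
+// _scan_snaps() then blocks on the C_SafeCond until that transaction is applied,
+// so that later readers of the snap mapper observe the corrected entry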
+
+int PgScrubber::build_primary_map_chunk()
+{
+ auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
+ m_end, m_is_deep);
+
+ if (ret == -EINPROGRESS)
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+
+ return ret;
+}
+
+int PgScrubber::build_replica_map_chunk()
+{
+ dout(10) << __func__ << " epoch start: " << m_epoch_start << " ep q: " << m_epoch_queued
+ << dendl;
+ dout(10) << __func__ << " deep: " << m_is_deep << dendl;
+
+ auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
+ m_is_deep);
+
+ if (ret == 0) {
+
+ // finished!
+ // In case we restarted with a smaller chunk, clear the old data
+
+ ScrubMap for_meta_scrub;
+ m_cleaned_meta_map.clear_from(m_start);
+ m_cleaned_meta_map.insert(replica_scrubmap);
+ clean_meta_map(for_meta_scrub);
+ _scan_snaps(for_meta_scrub);
+ }
+
+ // previous version used low priority here. Now switched to using the priority
+ // of the original message
+ if (ret == -EINPROGRESS)
+ requeue_replica(m_replica_request_priority);
+
+ return ret;
+}
+
+int PgScrubber::build_scrub_map_chunk(
+ ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+ dout(10) << __func__ << " [" << start << "," << end << ") "
+ << " pos " << pos << " Deep: " << deep << dendl;
+
+ // start
+ while (pos.empty()) {
+
+ pos.deep = deep;
+ map.valid_through = m_pg->info.last_update;
+
+ // objects
+ vector<ghobject_t> rollback_obs;
+ pos.ret =
+ m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+ dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+ if (pos.ret < 0) {
+ dout(5) << "objects_list_range error: " << pos.ret << dendl;
+ return pos.ret;
+ }
+ dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
+ if (pos.ls.empty()) {
+ break;
+ }
+ m_pg->_scan_rollback_obs(rollback_obs);
+ pos.pos = 0;
+ return -EINPROGRESS;
+ }
+
+ // scan objects
+ while (!pos.done()) {
+ int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
+ dout(10) << __func__ << " be r " << r << dendl;
+ if (r == -EINPROGRESS) {
+ dout(8 /*20*/) << __func__ << " in progress" << dendl;
+ return r;
+ }
+ }
+
+ // finish
+ dout(8 /*20*/) << __func__ << " finishing" << dendl;
+ ceph_assert(pos.done());
+ m_pg->_repair_oinfo_oid(map);
+
+ dout(8 /*20*/) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+ return 0;
+}
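+
+// a sketch of the chunking protocol: build_scrub_map_chunk() is re-entered
+// repeatedly; every -EINPROGRESS return makes the caller requeue itself (see
+// build_primary_map_chunk() / build_replica_map_chunk() above), while 'pos'
+// keeps the listing/scanning position - until pos.done() and the final 0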
+
+/**
+ * Move the accumulated meta map into 'for_meta_scrub' - except for the trailing
+ * group of objects sharing the last-seen head: if that head's snapset was not yet
+ * encountered, the group may still be incomplete, and is kept for the next chunk.
+ *
+ * @param for_meta_scrub the output map, handed over to the snapshots metadata scrub
+ */
+void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub)
+{
+ if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+ m_cleaned_meta_map.swap(for_meta_scrub);
+ } else {
+ auto iter = m_cleaned_meta_map.objects.end();
+ --iter; // not empty, see 'if' clause
+ auto begin = m_cleaned_meta_map.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;
+ } else {
+ while (iter != begin) {
+ auto next = iter--;
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);
+ m_cleaned_meta_map.objects.erase(begin, iter);
+ }
+}
+
+void PgScrubber::run_callbacks()
+{
+ std::list<Context*> to_run;
+ to_run.swap(m_callbacks);
+
+ for (auto& tr : to_run) {
+ tr->complete(0);
+ }
+}
+
+void PgScrubber::maps_compare_n_cleanup()
+{
+ scrub_compare_maps();
+ m_start = m_end;
+ run_callbacks();
+ requeue_waiting();
+}
+
+Scrub::preemption_t* PgScrubber::get_preemptor()
+{
+ return &preemption_data;
+}
+
+void PgScrubber::requeue_replica(Scrub::scrub_prio_t is_high_priority)
+{
+ dout(10) << __func__ << dendl;
+ m_osds->queue_for_rep_scrub_resched(m_pg, is_high_priority, m_flags.priority);
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+ auto msg = op->get_req<MOSDRepScrub>();
+ dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+ << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+ if (msg->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+ << " < " << m_pg->info.history.same_interval_since << dendl;
+ return;
+ }
+
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos = ScrubMapBuilder{};
+
+ // m_replica_epoch_start is overwritten if requeued waiting for active pushes
+ m_replica_epoch_start = m_pg->info.history.same_interval_since;
+ m_replica_min_epoch = msg->min_epoch;
+ m_start = msg->start;
+ m_end = msg->end;
+ m_max_end = msg->end;
+ m_is_deep = msg->deep;
+ m_epoch_start = m_pg->info.history.same_interval_since;
+ m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+ : Scrub::scrub_prio_t::low_priority;
+ m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
+
+ preemption_data.reset();
+ preemption_data.force_preemptability(msg->allow_preemption);
+
+ replica_scrubmap_pos.reset();
+
+ // make sure the FSM is at NotActive
+ m_fsm->assert_not_active();
+
+ m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority);
+}
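+
+// resulting flow (a sketch): the PGRepScrub queued above is dequeued as
+// replica_scrub(), which - barring a reset or an epoch change - calls
+// send_start_replica(), feeding StartReplica into the FSM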
+
+void PgScrubber::replica_scrub(epoch_t epoch_queued)
+{
+ dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
+ << dendl;
+ dout(20) << __func__ << " m_epoch_start: " << m_epoch_start
+ << " better be >= " << m_pg->info.history.same_interval_since << dendl;
+ dout(20) << __func__ << " m_is_deep: " << m_is_deep << dendl;
+
+ if (m_pg->pg_has_reset_since(epoch_queued)) {
+ dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
+ send_epoch_changed();
+ return;
+ }
+
+ if (was_epoch_changed()) {
+ dout(10) << "replica_scrub(epoch,) - epoch!" << dendl;
+ send_epoch_changed();
+ return;
+ }
+ ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check
+
+ send_start_replica();
+}
+
+void PgScrubber::replica_scrub_resched(epoch_t epoch_queued)
+{
+ dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
+ << dendl;
+
+ if (m_pg->pg_has_reset_since(epoch_queued)) {
+ dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
+ send_epoch_changed();
+ return;
+ }
+
+ if (was_epoch_changed()) {
+ dout(10) << __func__ << " epoch changed!" << dendl;
+ send_epoch_changed();
+ return;
+ }
+ ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check
+
+ send_sched_replica();
+}
+
+void PgScrubber::set_op_parameters(requested_scrub_t& request)
+{
+ dout(10) << __func__ << " input: " << request << dendl;
+
+ m_flags.check_repair = request.check_repair;
+ m_flags.auto_repair = request.auto_repair || request.need_auto;
+ m_flags.required = request.req_scrub || request.must_scrub;
+
+ m_flags.priority = (request.must_scrub || request.need_auto)
+ ? get_pg_cct()->_conf->osd_requested_scrub_priority
+ : m_pg->get_scrub_priority();
+
+ state_set(PG_STATE_SCRUBBING);
+
+ // will we be deep-scrubbing?
+ if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
+ state_set(PG_STATE_DEEP_SCRUB);
+ }
+
+ if (request.must_repair || m_flags.auto_repair) {
+ state_set(PG_STATE_REPAIR);
+ }
+
+ // the publishing here seems to be required for test synchronization
+ m_pg->publish_stats_to_osd();
+ m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+ request = requested_scrub_t{};
+}
+
+/**
+ * RRR \todo ask why we collect from acting+recovery+backfill, but use the size of
+ * only the acting set
+ */
+void PgScrubber::scrub_compare_maps()
+{
+ dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+ // construct authoritative scrub map for type-specific scrubbing
+ m_cleaned_meta_map.insert(m_primary_scrubmap);
+ map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+ map<pg_shard_t, ScrubMap*> maps;
+ maps[m_pg_whoami] = &m_primary_scrubmap;
+
+ for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+ if (i == m_pg_whoami)
+ continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << m_received_maps[i].objects.size() << " items" << dendl;
+ maps[i] = &m_received_maps[i];
+ }
+
+ set<hobject_t> master_set;
+
+ // Construct master set
+ for (const auto& map : maps) {
+ for (const auto& i : map.second->objects) {
+ master_set.insert(i.first);
+ }
+ }
+
+ stringstream ss;
+ m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->warn(ss);
+ }
+
+ if (m_pg->recovery_state.get_acting().size() > 1) {
+
+ // we have replicas: compare the maps, pick an authoritative copy for each
+ // damaged object, and record the inconsistencies
+
+ dout(10) << __func__ << " comparing replica scrub maps" << dendl;
+
+ // Map from object with errors to good peer
+ map<hobject_t, list<pg_shard_t>> authoritative;
+
+ dout(2) << __func__ << " " << m_pg->get_primary() << " has "
+ << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+ ss.str("");
+ ss.clear();
+
+ m_pg->get_pgbackend()->be_compare_scrubmaps(
+ maps, master_set, state_test(PG_STATE_REPAIR), m_missing, m_inconsistent,
+ authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+ m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+ dout(2) << ss.str() << dendl;
+
+ if (!ss.str().empty()) {
+ m_osds->clog->error(ss);
+ }
+
+ for (auto& i : authoritative) {
+ list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+ for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+ ++j) {
+ good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+ }
+ m_authoritative.emplace(i.first, good_peers);
+ }
+
+ for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+ m_cleaned_meta_map.objects.erase(i->first);
+ m_cleaned_meta_map.objects.insert(
+ *(maps[i->second.back()]->objects.find(i->first)));
+ }
+ }
+
+ ScrubMap for_meta_scrub;
+ clean_meta_map(for_meta_scrub);
+
+ // ok, do the pg-type specific scrubbing
+
+ // (Validates consistency of the object info and snap sets)
+ scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+ // called here for the primary; for a replica, _scan_snaps() is invoked from
+ // build_replica_map_chunk()
+ _scan_snaps(for_meta_scrub);
+
+ if (!m_store->empty()) {
+
+ if (state_test(PG_STATE_REPAIR)) {
+ dout(10) << __func__ << ": discarding scrub results" << dendl;
+ m_store->flush(nullptr);
+ } else {
+ dout(10) << __func__ << ": updating scrub object" << dendl;
+ ObjectStore::Transaction t;
+ m_store->flush(&t);
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+ }
+}
+
+void PgScrubber::replica_update_start_epoch()
+{
+ dout(10) << __func__ << " start:" << m_pg->info.history.same_interval_since << dendl;
+ m_replica_epoch_start = m_pg->info.history.same_interval_since;
+}
+
+/**
+ * Send the requested map back to the primary (or - if we
+ * were preempted - let the primary know).
+ */
+void PgScrubber::send_replica_map(bool was_preempted)
+{
+ dout(10) << __func__ << " min epoch:" << m_replica_min_epoch
+ << " epoch_start:" << m_replica_epoch_start << dendl;
+
+ auto reply = new MOSDRepScrubMap(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
+ m_replica_min_epoch, m_pg_whoami);
+
+ reply->preempted = was_preempted;
+ ::encode(replica_scrubmap, reply->get_data());
+
+ m_osds->send_message_osd_cluster(m_pg->get_primary().osd, reply, m_replica_min_epoch);
+}
+
+/**
+ * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ * The state-machine will react to that when all replica maps are received.
+ * - when all maps are received, we signal the FSM with the GotReplicas event (see
+ * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ * handle (well - the incoming message is marked for fast dispatching, which is an
+ * even better reason for handling it via the queue).
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDRepScrubMap>();
+ dout(15) << __func__ << " " << *m << dendl;
+
+ if (m->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+ << m_pg->info.history.same_interval_since << dendl;
+ return;
+ }
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+ m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+ dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+ [[maybe_unused]] auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+ ceph_assert(is_ok); // and not an error message, following the original code
+
+ if (m->preempted) {
+ dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+ ceph_assert(preemption_data.is_preemptable()); // otherwise - how dare the replica!
+ preemption_data.do_preempt();
+ }
+
+ if (m_maps_status.are_all_maps_available()) {
+ dout(10) << __func__ << " osd-queuing GotReplicas" << dendl;
+ m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+/**
+ * we are a replica being asked by the Primary to reserve OSD resources for
+ * scrubbing
+ */
+void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_reserved()) {
+ dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl;
+ return;
+ }
+
+ bool granted{false};
+
+ if (m_pg->cct->_conf->osd_scrub_during_recovery || !m_osds->is_recovery_active()) {
+
+ m_remote_osd_resource.emplace(m_pg, m_osds);
+ // OSD resources allocated?
+ granted = m_remote_osd_resource->is_reserved();
+ if (!granted) {
+ // just forget it
+ m_remote_osd_resource.reset();
+ dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+ }
+ }
+
+ dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+ auto m = op->get_req<MOSDScrubReserve>();
+ Message* reply = new MOSDScrubReserve(
+ spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m->map_epoch,
+ granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+ m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+}
+
+void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+ m_reservations->handle_reserve_grant(op, from);
+ } else {
+ derr << __func__ << ": received a grant with no reservation process active."
+ << " The replica's reservation will be leaked!" << dendl;
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+ // there is an active reservation process. No action is required otherwise.
+ m_reservations->handle_reserve_reject(op, from);
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+ m_remote_osd_resource.reset();
+}
+
+void PgScrubber::clear_scrub_reservations()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset(); // the remote reservations
+ m_local_osd_resource.reset(); // the local reservation
+ m_remote_osd_resource.reset(); // we as replica reserved for a Primary
+}
+
+void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+{
+ ceph_assert(m_pg->recovery_state.get_backfill_targets()
+ .empty()); // RRR ask: (the code was copied as is) Why checking here?
+
+ std::vector<std::pair<int, Message*>> messages;
+ messages.reserve(m_pg->get_actingset().size());
+
+ epoch_t epch = get_osdmap_epoch();
+
+ for (auto& p : m_pg->get_actingset()) {
+
+ if (p == m_pg_whoami)
+ continue;
+
+ dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
+ << dendl;
+ Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
+ m_pg_whoami);
+ messages.push_back(std::make_pair(p.osd, m));
+ }
+
+ if (!messages.empty()) {
+ m_osds->send_message_osd_cluster(messages, epch);
+ }
+}
+
+void PgScrubber::unreserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset();
+}
+
+[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+{
+ dout(10) << __func__ << ": checking authoritative" << dendl;
+
+ bool repair = state_test(PG_STATE_REPAIR);
+ const bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+ const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+ dout(20) << __func__ << " deep_scrub: " << deep_scrub << " m_is_deep: " << m_is_deep
+ << " repair: " << repair << dendl;
+
+ // 'm_authoritative' only stores objects which are missing or inconsistent.
+ if (!m_authoritative.empty()) {
+
+ stringstream ss;
+ ss << m_pg->info.pgid << " " << mode << " " << m_missing.size() << " missing, "
+ << m_inconsistent.size() << " inconsistent objects";
+ dout(2) << ss.str() << dendl;
+ m_osds->clog->error(ss);
+
+ if (repair) {
+ state_clear(PG_STATE_CLEAN);
+
+ for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+ auto missing_entry = m_missing.find(hobj);
+
+ if (missing_entry != m_missing.end()) {
+ m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+ m_fixed_count += missing_entry->second.size();
+ }
+
+ if (m_inconsistent.count(hobj)) {
+ m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+ m_fixed_count += m_inconsistent[hobj].size();
+ }
+ }
+ }
+ }
+ return (!m_authoritative.empty() && repair);
+}
+
+/*
+ * note: only called for the Primary.
+ */
+void PgScrubber::scrub_finish()
+{
+ dout(10) << __func__ << " before flags: " << m_flags
+ << " deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+ ceph_assert(m_pg->is_locked());
+
+ // if the repair request comes from auto-repair and a large number of errors was
+ // found, we would like to cancel the auto-repair
+
+ bool repair = state_test(PG_STATE_REPAIR);
+ if (repair && m_flags.auto_repair &&
+ m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+ dout(10) << __func__ << " undoing the repair" << dendl;
+ state_clear(PG_STATE_REPAIR);
+ repair = false;
+ }
+
+ bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+ const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+ bool do_auto_scrub = false;
+
+ // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+ if (m_flags.deep_scrub_on_error && m_authoritative.size() &&
+ m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+ ceph_assert(!deep_scrub);
+ do_auto_scrub = true;
+ dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+ }
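+
+ // numeric sketch (assuming the default osd_scrub_auto_repair_num_errors of 5):
+ // 6 authoritative errors cancel an auto-repair (above), while 1..5 errors on a
+ // shallow scrub with deep_scrub_on_error set schedule the follow-up deep scrub
+ // via request_rescrubbing() below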
+
+ m_flags.deep_scrub_on_error = false;
+
+ // type-specific finish (can tally more errors)
+ _scrub_finish();
+
+ bool has_error = scrub_process_inconsistent();
+
+ {
+ stringstream oss;
+ oss << m_pg->info.pgid.pgid << " " << mode << " ";
+ int total_errors = m_shallow_errors + m_deep_errors;
+ if (total_errors)
+ oss << total_errors << " errors";
+ else
+ oss << "ok";
+ if (!deep_scrub && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+ oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+ << " remaining deep scrub error details lost)";
+ if (repair)
+ oss << ", " << m_fixed_count << " fixed";
+ if (total_errors)
+ m_osds->clog->error(oss);
+ else
+ m_osds->clog->debug(oss);
+ }
+
+ // Since we don't know which errors were fixed, we can only clear them
+ // when every one has been fixed.
+ if (repair) {
+ if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+ ceph_assert(deep_scrub);
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ dout(20) << __func__ << " All may be fixed" << dendl;
+
+ } else if (has_error) {
+
+ // Deep scrub in order to get corrected error counts
+ m_pg->scrub_after_recovery = true;
+ m_pg->m_planned_scrub.req_scrub =
+ m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+ dout(20) << __func__ << " Current 'required': " << m_flags.required
+ << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+ } else if (m_shallow_errors || m_deep_errors) {
+
+ // We have errors but nothing can be fixed, so there is no repair
+ // possible.
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+ << " error(s) present with no repair possible" << dendl;
+ }
+ }
+
+ {
+ // finish up
+ ObjectStore::Transaction t;
+ m_pg->recovery_state.update_stats(
+ [this, deep_scrub](auto& history, auto& stats) {
+ dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+ utime_t now = ceph_clock_now();
+ history.last_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ if (m_is_deep) {
+ history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_deep_scrub_stamp = now;
+ }
+
+ if (deep_scrub) {
+ if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+ history.last_clean_scrub_stamp = now;
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+ stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+ stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+ stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+ dout(10 /*25*/) << "scrub_finish shard " << m_pg_whoami
+ << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+ << " num_omap_keys = " << stats.stats.sum.num_omap_keys
+ << dendl;
+ } else {
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+ // because of deep-scrub errors
+ if (m_shallow_errors == 0)
+ history.last_clean_scrub_stamp = now;
+ }
+ stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+ stats.stats.sum.num_deep_scrub_errors;
+ if (m_flags.check_repair) {
+ m_flags.check_repair = false;
+ if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+ << " error(s) still present after re-scrub" << dendl;
+ }
+ }
+ return true;
+ },
+ &t);
+ int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+
+ if (!m_pg->snap_trimq.empty()) {
+ dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+ m_pg->snap_trimmer_scrub_complete();
+ }
+ }
+
+ if (has_error) {
+ m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+ } else {
+ state_clear(PG_STATE_REPAIR);
+ }
+
+ cleanup_on_finish();
+ if (do_auto_scrub) {
+ request_rescrubbing(m_pg->m_planned_scrub);
+ }
+
+ if (m_pg->is_active() && m_pg->is_primary()) {
+ m_pg->recovery_state.share_pg_info();
+ }
+}
+
+Scrub::FsmNext PgScrubber::on_digest_updates()
+{
+ dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " are we done? "
+ << num_digest_updates_pending
+ << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
+
+ if (num_digest_updates_pending == 0) {
+
+ // got all updates, and finished with this chunk. Any more?
+ if (m_end.is_max()) {
+ scrub_finish();
+ return Scrub::FsmNext::goto_notactive;
+ } else {
+ // go get a new chunk (via "requeue")
+ preemption_data.reset();
+ return Scrub::FsmNext::next_chunk;
+ }
+ } else {
+ return Scrub::FsmNext::do_discard;
+ }
+}
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("scrubber");
+ f->dump_stream("epoch_start") << m_epoch_start;
+ f->dump_bool("active", m_active);
+ if (m_active) {
+ f->dump_stream("start") << m_start;
+ f->dump_stream("end") << m_end;
+ f->dump_stream("m_max_end") << m_max_end;
+ f->dump_stream("subset_last_update") << m_subset_last_update;
+ f->dump_bool("deep", m_is_deep);
+ f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+ f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+ f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+ f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+ f->dump_bool("req_scrub", m_flags.required);
+ f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+ f->dump_bool("auto_repair", m_flags.auto_repair);
+ f->dump_bool("check_repair", m_flags.check_repair);
+ f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+ f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t
+ f->dump_unsigned("priority", m_flags.priority);
+ f->dump_int("shallow_errors", m_shallow_errors);
+ f->dump_int("deep_errors", m_deep_errors);
+ f->dump_int("fixed", m_fixed_count);
+ {
+ f->open_array_section("waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
+
+
+void PgScrubber::handle_query_state(ceph::Formatter* f)
+{
+ dout(10) << __func__ << dendl;
+
+ f->open_object_section("scrub");
+ f->dump_stream("scrubber.epoch_start") << m_epoch_start;
+ f->dump_bool("scrubber.active", m_active);
+ f->dump_stream("scrubber.start") << m_start;
+ f->dump_stream("scrubber.end") << m_end;
+ f->dump_stream("scrubber.m_max_end") << m_max_end;
+ f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+ f->dump_bool("scrubber.deep", m_is_deep);
+ {
+ f->open_array_section("scrubber.waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+
+ f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+ f->close_section();
+}
+
+PgScrubber::~PgScrubber()
+{
+ dout(10) << __func__ << dendl;
+}
+
+PgScrubber::PgScrubber(PG* pg)
+ : m_pg{pg}
+ , m_pg_id{pg->pg_id}
+ , m_osds{m_pg->osd}
+ , m_pg_whoami{pg->pg_whoami}
+ , m_epoch_queued{0}
+ , preemption_data{pg}
+{
+ dout(20) << " creating PgScrubber for " << pg->pg_id << " / " << m_pg_whoami << dendl;
+ m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
+ m_fsm->initiate();
+}
+
+void PgScrubber::reserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.emplace(m_pg, m_pg_whoami);
+}
+
+// called only for normal end-of-scrub, and only for a Primary
+void PgScrubber::cleanup_on_finish()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+ m_pg->publish_stats_to_osd();
+
+ m_reservations.reset();
+ m_local_osd_resource.reset();
+
+ m_pg->requeue_ops(m_pg->waiting_for_scrub);
+
+ reset_internal_state();
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+// uses process_event(), so must be invoked externally
+void PgScrubber::scrub_clear_state(bool keep_repair_state)
+{
+ dout(10) << __func__ << dendl;
+
+ clear_pgscrub_state(keep_repair_state);
+ m_fsm->process_event(FullReset{});
+}
+
+/*
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state(bool keep_repair_state)
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+ if (!keep_repair_state)
+ state_clear(PG_STATE_REPAIR);
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ m_pg->requeue_ops(m_pg->waiting_for_scrub);
+
+ reset_internal_state();
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+void PgScrubber::replica_handling_done()
+{
+ dout(10) << __func__ << dendl;
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ // make sure we cleared the reservations!
+
+ preemption_data.reset();
+ m_maps_status.reset();
+ m_received_maps.clear();
+
+ m_start = hobject_t{};
+ m_end = hobject_t{};
+ m_max_end = hobject_t{};
+ m_subset_last_update = eversion_t{};
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ m_fixed_count = 0;
+ m_omap_stats = (const struct omap_stat_t){0};
+
+ run_callbacks();
+ m_inconsistent.clear();
+ m_missing.clear();
+ m_authoritative.clear();
+ num_digest_updates_pending = 0;
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos.reset();
+
+ m_cleaned_meta_map = ScrubMap{};
+ m_needs_sleep = true;
+ m_sleep_started_at = utime_t{};
+
+ m_active = false;
+ m_pg->publish_stats_to_osd();
+}
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+ dout(10) << __func__ << dendl;
+
+ preemption_data.reset();
+ m_maps_status.reset();
+ m_received_maps.clear();
+
+ m_start = hobject_t{};
+ m_end = hobject_t{};
+ m_max_end = hobject_t{};
+ m_subset_last_update = eversion_t{};
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ m_fixed_count = 0;
+ m_omap_stats = (const struct omap_stat_t){0};
+
+ run_callbacks();
+
+ m_inconsistent.clear();
+ m_missing.clear();
+ m_authoritative.clear();
+ num_digest_updates_pending = 0;
+ m_primary_scrubmap = ScrubMap{};
+ m_primary_scrubmap_pos.reset();
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos.reset();
+ m_cleaned_meta_map = ScrubMap{};
+ m_needs_sleep = true;
+ m_sleep_started_at = utime_t{};
+
+ m_flags = scrub_flags_t{};
+
+ m_active = false;
+}
+
+const OSDMapRef& PgScrubber::get_osdmap() const
+{
+ return m_pg->get_osdmap();
+}
+
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
+ return out << scrubber.m_flags;
+}
+
+ostream& PgScrubber::show(ostream& out) const
+{
+ return out << " [ " << m_pg_id << ": " << /*for now*/ m_flags << " ] ";
+}
+
// ///////////////////// preemption_data_t //////////////////////////////////
PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
} // namespace Scrub
-// an almost-empty PgScrubber for this commit:
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+ unsigned int priority{0};
+
+ /**
+ * set by queue_scrub() if either planned_scrub.auto_repair or
+ * need_auto were set.
+ * Tested at scrub end.
+ */
+ bool auto_repair{false};
+
+ /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+ bool check_repair{false};
+
+ /// checked at the end of the scrub, to possibly initiate a deep-scrub
+ bool deep_scrub_on_error{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the peering
+ * process with the 'repair' flag set (in the RequestScrub event).
+ */
+ bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+ public:
+ explicit PgScrubber(PG* pg);
+
+ // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------
+
+ /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] bool is_reserving() const final;
+
+ void send_start_scrub() final;
+
+ void send_start_after_repair() final;
+
+ void send_scrub_resched() final;
+
+ void active_pushes_notification() final;
+
+ void update_applied_notification(epoch_t epoch_queued) final;
+
+ void send_scrub_unblock() final;
+
+ void digest_update_notification() final;
+
+ void send_replica_maps_ready() final;
+
+ void send_replica_pushes_upd() final;
+
+ void reset_epoch(epoch_t epoch_queued) final;
+
+ /**
+ * we allow some number of preemptions of the scrub, which means we do
+ * not block. Then we start to block. Once we start blocking, we do
+ * not stop until the scrub range is completed.
+ */
+ bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+ /// true if the given range intersects the scrub interval in any way
+ bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+ void handle_scrub_reserve_request(OpRequestRef op) final;
+ void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_release(OpRequestRef op) final;
+ void clear_scrub_reservations() final; // PG::clear... fwds to here
+ void unreserve_replicas() final;
+
+ // managing scrub op registration
+
+ void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+ void unreg_next_scrub() final;
+
+ void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) final;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if the OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ bool reserve_local() final;
+
+ void handle_query_state(ceph::Formatter* f) final;
+
+ void dump(ceph::Formatter* f) const override;
+
+ // used if we are a replica
+
+ void replica_scrub_op(OpRequestRef op) final;
+ void replica_scrub(epoch_t epoch_queued) final;
+ void replica_scrub_resched(epoch_t epoch_queued) final;
+
+ /// the op priority, taken from the primary's request message
+ Scrub::scrub_prio_t replica_op_priority() const final
+ {
+ return m_replica_request_priority;
+ };
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const final;
+ /// the version that refers to m_flags.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+ void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+ [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc
+ {
+ return !m_callbacks.empty();
+ }
+
+ /// handle a message carrying a replica map
+ void map_from_replica(OpRequestRef op) final;
+
+ /**
+ * should we requeue blocked ops?
+ * Applicable to the PrimaryLogScrub derived class.
+ */
+ [[nodiscard]] virtual bool should_requeue_blocked_ops(
+ eversion_t last_recovery_applied) const override
+ {
+ return false;
+ }
+
+ void scrub_clear_state(bool keep_repair_state = false) final;
+
+ /**
+ * add to scrub statistics, but only if the soid is below the scrub start
+ */
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) override
+ {
+ ceph_assert(false);
+ }
+
+ /**
+ * finalize the parameters of the initiated scrubbing session:
+ *
+ * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+ * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+ */
+ void set_op_parameters(requested_scrub_t& request) final;
+
+ void cleanup_store(ObjectStore::Transaction* t) final;
+
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const override
+ {
+ return false;
+ };
+
+ // -------------------------------------------------------------------------------------------
+ // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+ bool select_range() final;
+
+ /// walk the log to find the latest update that affects our chunk
+ eversion_t search_log_for_updates() const final;
+
+ eversion_t get_last_update_applied() const final
+ {
+ return m_pg->recovery_state.get_last_update_applied();
+ }
+
+ void requeue_waiting() const final { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+ int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+ void scrub_compare_maps() final;
+
+ void on_init() final;
+ void on_replica_init() final;
+ void replica_handling_done() final;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ void clear_pgscrub_state(bool keep_repair_state) final;
+
+ void add_delayed_scheduling() final;
+
+ /**
+ * @returns have we asked at least one replica?
+ * 'false' means we are configured with no replicas, and
+ * should expect no maps to arrive.
+ */
+ bool get_replicas_maps(bool replica_can_preempt) final;
+
+ Scrub::FsmNext on_digest_updates() final;
+
+ void send_replica_map(bool was_preempted) final;
+
+ void send_remotes_reserved() final;
+ void send_reservation_failure() final;
+
+ /**
+ * does the PG have newer updates than what we (the scrubber) know?
+ */
+ [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+ void set_subset_last_update(eversion_t e) final;
+
+ void replica_update_start_epoch() final;
+
+ void maps_compare_n_cleanup() final;
+
+ Scrub::preemption_t* get_preemptor() final;
+
+ int build_primary_map_chunk() final;
+
+ int build_replica_map_chunk() final;
+
+ void reserve_replicas() final;
+
+ [[nodiscard]] bool was_epoch_changed() const final;
+
+ void mark_local_map_ready() final;
+
+ [[nodiscard]] bool are_all_maps_available() const final;
+
+ std::string dump_awaited_maps() const final;
+
+ protected:
+ bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+ void state_set(uint64_t m) { m_pg->state_set(m); }
+ void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+ [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); }
+
+ [[nodiscard]] bool is_scrub_registered() const;
+
+ virtual void _scrub_clear_state() {}
+
+ utime_t m_scrub_reg_stamp; ///< stamp we registered for
+
+ ostream& show(ostream& out) const override;
+
+ public:
+ // -------------------------------------------------------------------------------------------
+
+ friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+ static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+ virtual ~PgScrubber(); // must be defined separately, in the .cc file
+
+ [[nodiscard]] bool is_scrub_active() const final;
+
+ private:
+ void reset_internal_state();
+
+ void _scan_snaps(ScrubMap& smap); // note: the underscore-prefixed name (non-standard
+ // for a non-virtual function) is searched for by the
+ // QA standalone tests. Do not modify.
+
+ void clean_meta_map(ScrubMap& for_meta_scrub);
+
+ void run_callbacks();
+
+ /**
+ * are we still a clean & healthy scrubbing primary?
+ *
+ * relevant only after the initial sched_scrub
+ */
+ [[nodiscard]] bool is_event_relevant(epoch_t queued) const;
+
+ /**
+ * check the 'no scrub' configuration options.
+ */
+ [[nodiscard]] bool should_abort_scrub(epoch_t queued) const;
+
+ void send_epoch_changed();
+
+ /**
+ * return true if any inconsistency/missing is repaired, false otherwise
+ */
+ [[nodiscard]] bool scrub_process_inconsistent();
+
+ bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always
+ ///< 'true', unless we just got out of a sleep period
+
+ // held as std::optional-s, as ReplicaReservations & LocalReservation are RAII
+ // objects: destroying them guarantees un-reserving.
+ std::optional<Scrub::ReplicaReservations> m_reservations;
+ std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+ /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+ std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
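+ // (usage sketch, for orientation; the constructor arguments are outside this
+ //  hunk:  m_reservations.emplace(...) acquires the reservations, while
+ //  m_reservations.reset() - or the scrubber's destruction - releases them.)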
+
+ void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when
+ // Active->NotActive
+
+ /// the part that actually finalizes a scrub
+ void scrub_finish();
+
+ utime_t m_sleep_started_at;
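+ // (intended flow, as a sketch - the details live in add_delayed_scheduling():
+ //  if m_needs_sleep is set, note m_sleep_started_at and requeue only after the
+ //  configured scrub sleep has elapsed; otherwise requeue immediately.)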
+
+ protected:
+ PG* const m_pg;
+
+ /**
+ * the derived-class-specific scrub-finishing touches:
+ */
+ virtual void _scrub_finish() {}
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+ {}
+
+ // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+ int build_scrub_map_chunk(ScrubMap& map,  // the map to fill (the primary's or a replica's)
+ ScrubMapBuilder& pos,
+ hobject_t start,
+ hobject_t end,
+ bool deep);
+
+ std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+ const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+ OSDService* const m_osds;
+ const pg_shard_t m_pg_whoami;  ///< a local copy of m_pg->pg_whoami
+
+ epoch_t m_epoch_start;  ///< epoch when scrubbing was first scheduled
+ epoch_t m_epoch_queued;  ///< epoch at which the currently-handled scrub event was queued
+ scrub_flags_t m_flags;
+
+ bool m_active{false};
+
+ eversion_t m_subset_last_update;
+
+ std::unique_ptr<Scrub::Store> m_store;
+
+ int num_digest_updates_pending{0};
+ hobject_t m_start, m_end;  ///< note: half-open interval: [start, end)
+
+ /// Returns reference to current osdmap
+ const OSDMapRef& get_osdmap() const;
+
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+ CephContext* get_pg_cct() const { return m_pg->cct; }
+
+ void send_start_replica();
+
+ void send_sched_replica();
+
+ // collected statistics
+ int m_shallow_errors{0};
+ int m_deep_errors{0};
+ int m_fixed_count{0};
+
+ /// Maps from objects with errors to missing peers
+ HobjToShardSetMapping m_missing;
+
+ private:
+ /**
+ * 'm_is_deep' - is the running scrub a deep one?
+ *
+ * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+ * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+ * meaningful both for the primary and the replicas, and is used as a parameter when
+ * building the scrub maps.
+ */
+ bool m_is_deep{false};
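+ // (illustration: m_is_deep is what ends up as the 'deep' argument of
+ //  build_scrub_map_chunk(), on both the primary and the replicas)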
+
+ inline static int fake_count{2}; // unit-tests. To be removed
+
+ /**
+ * initiate a deep-scrub after the current scrub ended with errors.
+ */
+ void request_rescrubbing(requested_scrub_t& req_flags);
+
+ std::list<Context*> m_callbacks;
+
+ /**
+ * send a replica (un)reservation request to the acting set
+ *
+ * @param opcode - one of MOSDScrubReserve::REQUEST
+ * or MOSDScrubReserve::RELEASE
+ */
+ void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+ hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+ ScrubMap m_primary_scrubmap;
+ ScrubMapBuilder m_primary_scrubmap_pos;
+
+ std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+ /// the cleaned-up ScrubMap, pending the snap-metadata scrub stage
+ ScrubMap m_cleaned_meta_map;
+
+ void _request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption);
+
+ Scrub::MapsCollectionStatus m_maps_status;
+
+ omap_stat_t m_omap_stats = {};  // value-initialized
+
+ /// Maps from objects with errors to inconsistent peers
+ HobjToShardSetMapping m_inconsistent;
+
+ /// Maps from object with errors to good peers
+ std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+ // ------------ members used if we are a replica
+
+ epoch_t m_replica_epoch_start;
+ epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+ ScrubMapBuilder replica_scrubmap_pos;  ///< \todo document
+ ScrubMap replica_scrubmap;  ///< \todo document
+ /**
+ * the priority of the scrub request, as it arrived. It determines the queuing
+ * priority used while we wait for local updates.
+ */
+ Scrub::scrub_prio_t m_replica_request_priority;
+
+ /**
+ * Queue a rescheduling event for the replica, to trigger a re-check of the
+ * availability of the scrub map prepared by the backend.
+ */
+ void requeue_replica(Scrub::scrub_prio_t is_high_priority);
+
/**
* the 'preemption' "state-machine".
* Note: I was considering an orthogonal sub-machine implementation, but as
return m_left > 0;
}
};
+
+ preemption_data_t preemption_data;
+
+ // debug/development temporary code:
+ void debug_dump_reservations(std::string_view header_txt) const;
};
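A note on the structure above: the ScrubMachineListener block is the only surface the scrub state-machine sees; PgScrubber implements it, and the FSM never touches the PG directly. The toy below models that decoupling. Every name in it is an illustrative stand-in (the real ScrubMachine is defined elsewhere in this change), but the select_range() / 'Unblocked' interplay mirrors the declarations above.

```cpp
#include <iostream>

// Toy model of the listener pattern: the machine sees only an abstract
// listener; the scrubber is the implementation. Illustrative names only.
struct Listener {
  virtual ~Listener() = default;
  virtual bool select_range() = 0;                   // pick next chunk; false if blocked
  virtual bool get_replicas_maps(bool preempt) = 0;  // false if there are no replicas
};

struct Machine {
  explicit Machine(Listener& l) : scrbr{l} {}

  void on_new_chunk()
  {
    if (!scrbr.select_range()) {
      // the chunk overlaps an in-flight client write: do nothing, and wait
      // to be requeued (cf. the 'Unblocked' / PGScrubUnblocked event)
      return;
    }
    if (!scrbr.get_replicas_maps(/*replica_can_preempt=*/true)) {
      std::cout << "no replicas configured - no remote maps to wait for\n";
    }
  }

  Listener& scrbr;
};

struct StubScrubber : Listener {
  bool select_range() override { return true; }
  bool get_replicas_maps(bool) override { return false; }
};

int main()
{
  StubScrubber s;
  Machine{s}.on_new_chunk();
}
```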
pg->unlock();
}
+void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->scrub(epoch_queued, handle);
+ pg->unlock();
+}
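All the run() overrides in this part of the change share one contract: the scheduler invokes them with the PG lock already held, and each handler must end with pg->unlock() (note that none of them take the lock). Since every body is a one-line forward to a PG handler, the boilerplate could in principle be collapsed. A sketch of a hypothetical helper - not part of this change; it only assumes the same lock contract and the PG methods already named in the overrides:

```cpp
// Hypothetical (not in this PR): forward a queued scrub event to its PG
// handler via a member-function-pointer template parameter, preserving the
// 'locked on entry, unlocked on exit' contract of the run() overrides.
template <auto PgHandler>
void run_scrub_event(PGRef& pg, epoch_t epoch_queued, ThreadPool::TPHandle& handle)
{
  ((*pg).*PgHandler)(epoch_queued, handle);
  pg->unlock();
}

// e.g. PGScrubPushesUpdate::run() would reduce to:
//   run_scrub_event<&PG::scrub_send_pushes_update>(pg, epoch_queued, handle);
```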
+
+void PGScrubAfterRepair::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->recovery_scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_scrub_resched(epoch_queued, handle);
+ pg->unlock();
+}
+
void PGScrubResourcesOK::run(OSD* osd,
OSDShard* sdata,
PGRef& pg,
pg->unlock();
}
-void PGScrub::run(
- OSD *osd,
- OSDShard *sdata,
- PGRef& pg,
- ThreadPool::TPHandle &handle)
+void PGScrubPushesUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
{
- pg->scrub(epoch_queued, handle);
+ pg->scrub_send_pushes_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubAppliedUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_applied_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubUnblocked::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_unblocking(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubDigestUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_digest_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGotReplMaps::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replmaps_ready(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRepScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub_resched(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replica_pushes(epoch_queued, handle);
pg->unlock();
}
}
};
+class PGScrubResched : public PGScrubItem {
+ public:
+ PGScrubResched(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
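The common base, PGScrubItem, is declared above this hunk. From the `{pg, epoch_queued, "EventName"}` constructor calls one can infer its shape: a PGOpQueueable carrying the queued epoch plus a human-readable event name for logging. A reconstruction for orientation only; the actual declaration may differ:

```cpp
// Orientation only - inferred from the constructor calls in this hunk,
// not copied from the actual declaration.
class PGScrubItem : public PGOpQueueable {
 protected:
  epoch_t epoch_queued;
  std::string_view message_name;  ///< used when logging/printing the event

  PGScrubItem(spg_t pg, epoch_t epoch, std::string_view name)
      : PGOpQueueable{pg}, epoch_queued{epoch}, message_name{name}
  {}
};
```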
/**
* all replicas have granted our scrub resources request
*/
void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
};
+/**
+ * called when a repair process completes, to initiate scrubbing. No local/remote
+ * resources are allocated.
+ */
+class PGScrubAfterRepair : public PGScrubItem {
+ public:
+ PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubPushesUpdate : public PGScrubItem {
+ public:
+ PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubAppliedUpdate : public PGScrubItem {
+ public:
+ PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubUnblocked : public PGScrubItem {
+ public:
+ PGScrubUnblocked(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubDigestUpdate : public PGScrubItem {
+ public:
+ PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotReplMaps : public PGScrubItem {
+ public:
+ PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrub : public PGScrubItem {
+ public:
+ PGRepScrub(spg_t pg, epoch_t epoch_queued) : PGScrubItem{pg, epoch_queued, "PGRepScrub"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrubResched : public PGScrubItem {
+ public:
+ PGRepScrubResched(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGRepScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubReplicaPushes : public PGScrubItem {
+ public:
+ PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
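The subclasses above are deliberately uniform, so extending the set is mechanical. A sketch of the three steps for a hypothetical new event - the class name and the target PG handler are invented for illustration:

```cpp
// (1) declare the item, mirroring its siblings above:
class PGScrubChunkDone : public PGScrubItem {
 public:
  PGScrubChunkDone(spg_t pg, epoch_t epoch_queued)
      : PGScrubItem{pg, epoch_queued, "PGScrubChunkDone"}
  {}
  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
};

// (2) forward to the (hypothetical) PG handler, keeping the unlock contract:
void PGScrubChunkDone::run(OSD* osd, OSDShard* sdata, PGRef& pg,
                           ThreadPool::TPHandle& handle)
{
  pg->scrub_send_chunk_done(epoch_queued, handle);  // hypothetical handler
  pg->unlock();
}

// (3) add an OSDService entry point that creates and enqueues the item,
//     following the existing scrub-event wrappers.
```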
class PGRecovery : public PGOpQueueable {
epoch_t epoch_queued;
uint64_t reserved_pushes;