<< " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
<< dendl;
- bool repair = state_test(PG_STATE_REPAIR);
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
-
if (info.stats.stats_invalid) {
m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
stats.stats = m_scrub_cstat;
m_pl_pg->agent_choose_mode();
}
- dout(10) << mode << " got " << m_scrub_cstat.sum.num_objects << "/"
+ dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
<< info.stats.stats.sum.num_objects << " objects, "
<< m_scrub_cstat.sum.num_object_clones << "/"
<< info.stats.stats.sum.num_object_clones << " clones, "
!info.stats.manifest_stats_invalid) ||
m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
- m_osds->clog->error() << info.pgid << " " << mode << " : stat mismatch, got "
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
<< m_scrub_cstat.sum.num_objects << "/"
<< info.stats.stats.sum.num_objects << " objects, "
<< m_scrub_cstat.sum.num_object_clones << "/"
<< " hit_set_archive bytes.";
++m_shallow_errors;
- if (repair) {
+ if (m_is_repair) {
++m_fixed_count;
m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
stats.stats = m_scrub_cstat;
}
}
// Clear object context cache to get repair information
- if (repair)
+ if (m_is_repair)
m_pl_pg->object_contexts.clear();
}
LogChannelRef clog,
const spg_t& pgid,
const char* func,
- const char* mode,
bool allow_incomplete_clones)
{
ceph_assert(head);
if (allow_incomplete_clones) {
- dout(20) << func << " " << mode << " " << pgid << " " << *head << " skipped "
+ dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
<< missing << " clone(s) in cache tier" << dendl;
} else {
- clog->info() << mode << " " << pgid << " " << *head << " : " << missing
+ clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
<< " missing clone(s)";
}
}
const std::optional<SnapSet>& snapset,
LogChannelRef clog,
const spg_t& pgid,
- const char* mode,
bool allow_incomplete_clones,
std::optional<snapid_t> target,
vector<snapid_t>::reverse_iterator* curclone,
// skip higher-numbered clones in the list.
if (!allow_incomplete_clones) {
next_clone.snap = **curclone;
- clog->error() << mode << " " << pgid << " " << *head << " : expected clone "
+ clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
<< next_clone << " " << m_missing << " missing";
++m_shallow_errors;
e.set_clone_missing(next_clone.snap);
const PGPool& pool = m_pl_pg->pool;
bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
- bool repair = state_test(PG_STATE_REPAIR);
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
-
std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
// traverse in reverse order.
// basic checks.
if (p->second.attrs.count(OI_ATTR) == 0) {
oi = std::nullopt;
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid << " : no '"
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
<< OI_ATTR << "' attr";
++m_shallow_errors;
soid_error.set_info_missing();
oi->decode(bv);
} catch (ceph::buffer::error& e) {
oi = std::nullopt;
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : can't decode '" << OI_ATTR << "' attr " << e.what();
++m_shallow_errors;
soid_error.set_info_corrupted();
if (oi) {
if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : on disk size (" << p->second.size
<< ") does not match object info size (" << oi->size
<< ") adjusted for ondisk to ("
++m_shallow_errors;
}
- dout(20) << mode << " " << soid << " " << *oi << dendl;
+ dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
// A clone num_bytes will be added later when we have snapset
if (!soid.is_snap()) {
// Expecting an object with snap for current head
if (soid.has_snapset() || soid.get_head() != head->get_head()) {
- dout(10) << __func__ << " " << mode << " " << info.pgid << " new object " << soid
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
<< " while processing " << *head << dendl;
target = all_clones;
// Log any clones we were expecting to be there up to target
// This will set missing, but will be a no-op if snap.soid == *curclone.
missing +=
- process_clones_to(head, snapset, m_osds->clog, info.pgid, mode,
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
allow_incomplete_clones, target, &curclone, head_error);
}
if (!expected) {
// If we couldn't read the head's snapset, just ignore clones
if (head && !snapset) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : clone ignored due to missing snapset";
} else {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : is an unexpected clone";
}
++m_shallow_errors;
if (soid.has_snapset()) {
if (missing) {
- log_missing(missing, head, m_osds->clog, info.pgid, __func__, mode,
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
pool.info.allow_incomplete_clones());
}
head_error = soid_error;
soid_error_count = 0;
- dout(20) << __func__ << " " << mode << " new head " << head << dendl;
+ dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
if (p->second.attrs.count(SS_ATTR) == 0) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid << " : no '"
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
<< SS_ATTR << "' attr";
++m_shallow_errors;
snapset = std::nullopt;
} catch (ceph::buffer::error& e) {
snapset = std::nullopt;
m_osds->clog->error()
- << mode << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+ << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
<< "' attr " << e.what();
++m_shallow_errors;
head_error.set_snapset_corrupted();
dout(20) << " snapset " << *snapset << dendl;
if (snapset->seq == 0) {
m_osds->clog->error()
- << mode << " " << info.pgid << " " << soid << " : snaps.seq not set";
+ << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
++m_shallow_errors;
head_error.set_snapset_error();
}
ceph_assert(snapset);
ceph_assert(soid.snap == *curclone);
- dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
+ dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
if (snapset->clone_size.count(soid.snap) == 0) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : is missing in clone_size";
++m_shallow_errors;
soid_error.set_size_mismatch();
} else {
if (oi && oi->size != snapset->clone_size[soid.snap]) {
m_osds->clog->error()
- << mode << " " << info.pgid << " " << soid << " : size " << oi->size
+ << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
<< " != clone_size " << snapset->clone_size[*curclone];
++m_shallow_errors;
soid_error.set_size_mismatch();
}
if (snapset->clone_overlap.count(soid.snap) == 0) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : is missing in clone_overlap";
++m_shallow_errors;
soid_error.set_size_mismatch();
}
if (bad_interval_set) {
- m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
<< " : bad interval_set in clone_overlap";
++m_shallow_errors;
soid_error.set_size_mismatch();
}
if (doing_clones(snapset, curclone)) {
- dout(10) << __func__ << " " << mode << " " << info.pgid
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
<< " No more objects while processing " << *head << dendl;
missing +=
- process_clones_to(head, snapset, m_osds->clog, info.pgid, mode,
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
allow_incomplete_clones, all_clones, &curclone, head_error);
}
// There could be missing found by the test above or even
// before dropping out of the loop for the last head.
if (missing) {
- log_missing(missing, head, m_osds->clog, info.pgid, __func__, mode,
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
allow_incomplete_clones);
}
if (head && (head_error.errors || soid_error_count))
ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
if (!obc) {
- m_osds->clog->error() << info.pgid << " " << mode
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc
<< " cannot get object context for object " << p->first;
continue;
}
if (obc->obs.oi.soid != p->first) {
- m_osds->clog->error() << info.pgid << " " << mode << " " << p->first
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
<< " : object has a valid oi attr with a mismatched name, "
<< " obc->obs.oi.soid: " << obc->obs.oi.soid;
continue;
m_pl_pg->simple_opc_submit(std::move(ctx));
}
- dout(10) << __func__ << " (" << mode << ") finish" << dendl;
+ dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
}
PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {}
m_epoch_start = epoch_queued;
m_needs_sleep = true;
m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+ update_op_mode_text();
}
unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
return m_maps_status.dump();
}
+// Recompute m_mode_desc, the textual description of the current scrub
+// operation that is used in log and cluster-log messages:
+//   "repair"     - when the PG_STATE_REPAIR state flag is set;
+//   "deep-scrub" - otherwise, when m_is_deep is set;
+//   "scrub"      - otherwise.
+// Note that the text follows the PG_STATE_REPAIR flag and not the
+// m_is_repair member; the two values are logged side by side below
+// ("visible" vs. "internal") so that any difference shows up in the debug log.
+void PgScrubber::update_op_mode_text()
+{
+ auto visible_repair = state_test(PG_STATE_REPAIR);
+ m_mode_desc = (visible_repair ? "repair"sv : (m_is_deep ? "deep-scrub"sv : "scrub"sv));
+
+ dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
+ << ", internal: " << (m_is_repair ? "true" : "false")
+ << ". Displayed: " << m_mode_desc << dendl;
+}
+
void PgScrubber::_request_scrub_map(pg_shard_t replica,
eversion_t version,
hobject_t start,
state_set(PG_STATE_DEEP_SCRUB);
}
- if (request.must_repair || m_flags.auto_repair) {
+ // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
+ // deep-scrub with the auto_repair configuration flag set). m_is_repair value
+ // determines the scrubber behavior.
+ // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
+ // PG status as appearing in the logs).
+ m_is_repair = request.must_repair || m_flags.auto_repair;
+ if (request.must_repair) {
state_set(PG_STATE_REPAIR);
+ // not calling update_op_mode_text() yet, as m_is_deep not set yet
}
// the publishing here seems to be required for tests synchronization
ss.clear();
m_pg->get_pgbackend()->be_compare_scrubmaps(
- maps, master_set, state_test(PG_STATE_REPAIR), m_missing, m_inconsistent,
+ maps, master_set, m_is_repair, m_missing, m_inconsistent,
authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
dout(2) << ss.str() << dendl;
if (!m_store->empty()) {
- if (state_test(PG_STATE_REPAIR)) {
+ if (m_is_repair) {
dout(10) << __func__ << ": discarding scrub results" << dendl;
m_store->flush(nullptr);
} else {
[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
{
- dout(10) << __func__ << ": checking authoritative" << dendl;
-
- bool repair = state_test(PG_STATE_REPAIR);
- const bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
- dout(20) << __func__ << " deep_scrub: " << deep_scrub << " m_is_deep: " << m_is_deep
- << " repair: " << repair << dendl;
+ dout(10) << __func__ << ": checking authoritative (mode="
+ << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
+ << ")" << dendl;
// authoritative only store objects which are missing or inconsistent.
if (!m_authoritative.empty()) {
stringstream ss;
- ss << m_pg->info.pgid << " " << mode << " " << m_missing.size() << " missing, "
+ ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
<< m_inconsistent.size() << " inconsistent objects";
dout(2) << ss.str() << dendl;
m_osds->clog->error(ss);
- if (repair) {
+ if (m_is_repair) {
state_clear(PG_STATE_CLEAN);
+ // we know we have a problem, so it's OK to set the user-visible flag
+ // even if we only reached here via auto-repair
+ state_set(PG_STATE_REPAIR);
+ update_op_mode_text();
for (const auto& [hobj, shrd_list] : m_authoritative) {
}
}
}
- return (!m_authoritative.empty() && repair);
+ return (!m_authoritative.empty() && m_is_repair);
}
/*
// if the repair request comes from auto-repair and large number of errors,
// we would like to cancel auto-repair
-
- bool repair = state_test(PG_STATE_REPAIR);
- if (repair && m_flags.auto_repair &&
+ if (m_is_repair && m_flags.auto_repair &&
m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
dout(10) << __func__ << " undoing the repair" << dendl;
- state_clear(PG_STATE_REPAIR);
- repair = false;
+ state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
+ m_is_repair = false;
+ update_op_mode_text();
}
- bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
- const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
bool do_auto_scrub = false;
// if a regular scrub had errors within the limit, do a deep scrub to auto repair
if (m_flags.deep_scrub_on_error && m_authoritative.size() &&
m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
- ceph_assert(!deep_scrub);
+ ceph_assert(!m_is_deep);
do_auto_scrub = true;
dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
}
{
stringstream oss;
- oss << m_pg->info.pgid.pgid << " " << mode << " ";
+ oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
int total_errors = m_shallow_errors + m_deep_errors;
if (total_errors)
oss << total_errors << " errors";
else
oss << "ok";
- if (!deep_scrub && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+ if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
<< " remaining deep scrub error details lost)";
- if (repair)
+ if (m_is_repair)
oss << ", " << m_fixed_count << " fixed";
if (total_errors)
m_osds->clog->error(oss);
// Since we don't know which errors were fixed, we can only clear them
// when every one has been fixed.
- if (repair) {
+ if (m_is_repair) {
if (m_fixed_count == m_shallow_errors + m_deep_errors) {
- ceph_assert(deep_scrub);
+ ceph_assert(m_is_deep);
m_shallow_errors = 0;
m_deep_errors = 0;
dout(20) << __func__ << " All may be fixed" << dendl;
// finish up
ObjectStore::Transaction t;
m_pg->recovery_state.update_stats(
- [this, deep_scrub](auto& history, auto& stats) {
+ [this](auto& history, auto& stats) {
dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
utime_t now = ceph_clock_now();
history.last_scrub = m_pg->recovery_state.get_info().last_update;
history.last_deep_scrub_stamp = now;
}
- if (deep_scrub) {
+ if (m_is_deep) {
if ((m_shallow_errors == 0) && (m_deep_errors == 0))
history.last_clean_scrub_stamp = now;
stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
} else {
+ m_is_repair = false;
state_clear(PG_STATE_REPAIR);
+ update_op_mode_text();
}
cleanup_on_finish();