We use pg_log.missing to track each peer's missing objects separately,
whereas missing_loc records the location of all (probably existing) good copies
for both primary and replicas' missing objects. Hence an item from
pg_log.missing or missing_loc is of different meaning and is not comparable.
During recovery, we can skip recovering primary only if
- primary is good, e.g., has no missing at all
- or all of the primary's missing objects do exist in missing_loc and are
currently unfound
Obviously, the current "all missing objects are unfound" checker is broken.
Fix by introducing an independent all_missing_unfound helper to make the
count of missing objects that are currently unfound correct.
Fixes: http://tracker.ceph.com/issues/38784
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit
d9497139a6f516ce015bb43a2e7f8958638cf8f8)
uint64_t get_num_unfound() const {
return missing_loc.num_unfound();
}
+ bool all_missing_unfound() const {
+ const auto& missing = pg_log.get_missing();
+ if (!missing.have_missing())
+ return false;
+ for (auto& m : missing.get_items()) {
+ if (!missing_loc.is_unfound(m.first))
+ return false;
+ }
+ return true;
+ }
virtual void check_local() = 0;
const auto &missing = pg_log.get_missing();
- unsigned int num_missing = missing.num_missing();
uint64_t num_unfound = get_num_unfound();
- if (num_missing == 0) {
+ if (!missing.have_missing()) {
info.last_complete = info.last_update;
}
- if (num_missing == num_unfound) {
- // All of the missing objects we have are unfound.
+ if (!missing.have_missing() || // Primary does not have missing
+ all_missing_unfound()) { // or all of the missing objects are unfound.
// Recover the replicas.
started = recover_replicas(max, handle, &recovery_started);
}