dout(20) << "merge_old_entry had " << oe << " updating missing to "
<< oe.prior_version << dendl;
if (oe.prior_version > eversion_t()) {
+ ondisklog.add_divergent_prior(oe.prior_version, oe.soid);
+ dirty_log = true;
missing.revise_need(oe.soid, oe.prior_version);
} else if (missing.is_missing(oe.soid)) {
missing.rm(oe.soid, missing.missing[oe.soid].need);
{
// trim?
if (trim_to > log.tail) {
+ /* If we are trimming, we must be complete up to trim_to, so it is
+ * time to throw out any divergent_priors.
+ */
+ ondisklog.divergent_priors.clear();
// We shouldn't be trimming the log past last_complete
assert(trim_to <= info.last_complete);
missing.add(i->soid, i->version, eversion_t());
}
}
+ for (map<eversion_t, hobject_t>::reverse_iterator i =
+ ondisklog.divergent_priors.rbegin();
+ i != ondisklog.divergent_priors.rend();
+ ++i) {
+ if (i->first <= info.last_complete) break;
+ if (did.count(i->second)) continue;
+ did.insert(i->second);
+ bufferlist bv;
+ int r = osd->store->getattr(coll, i->second, OI_ATTR, bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ /**
+ * 1) we see this entry in the divergent priors mapping
+ * 2) we didn't see an entry for this object in the log
+ *
+ * From 1 & 2 we know that either the object does not exist
+ * or it is at the version specified in the divergent_priors
+ * map, because:
+ *  - the object would have been deleted atomically with the
+ *    addition of the divergent_priors entry,
+ *  - an older version would not have been recovered, and
+ *  - a newer version would show up in the log above.
+ */
+ assert(oi.version == i->first);
+ } else {
+ dout(15) << "read_log missing " << *i << dendl;
+ missing.add(i->second, i->first, eversion_t());
+ }
+ }
}
dout(10) << "read_log done" << dendl;
}
uint64_t zero_to; // first non-zeroed byte of log.
bool has_checksums;
+ /**
+ * We reconstruct the missing set by comparing the recorded log against
+ * the objects in the pg collection. Unfortunately, it's possible to
+ * have an object in the missing set which is not in the log due to
+ * a divergent operation with a prior_version pointing before the
+ * pg log tail. To deal with this, we store alongside the log a mapping
+ * of divergent priors to be checked along with the log during read_state.
+ */
+ map<eversion_t, hobject_t> divergent_priors;
+ void add_divergent_prior(eversion_t version, hobject_t obj) { // Record obj in divergent_priors, keyed by version.
+ divergent_priors.insert(make_pair(version, obj)); // insert() does not overwrite: an entry already stored under this version is kept (std::map semantics)
+ }
+
OndiskLog() : tail(0), head(0), zero_to(0), // offsets start at 0; divergent_priors is not listed, so it is default-constructed (empty)
has_checksums(true) {} // has_checksums starts out true
}
void encode(bufferlist& bl) const { // Serialize tail, head, zero_to and divergent_priors into bl, in that order.
- ENCODE_START(4, 3, bl);
+ ENCODE_START(5, 3, bl); // first argument bumped 4 -> 5 because divergent_priors is now encoded; second argument (presumably the compat version) stays 3
::encode(tail, bl);
::encode(head, bl);
::encode(zero_to, bl);
+ ::encode(divergent_priors, bl); // appended after the pre-existing fields; decode() reads it only when struct_v >= 5
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
::decode(zero_to, bl);
else
zero_to = 0;
+ if (struct_v >= 5)
+ ::decode(divergent_priors, bl);
DECODE_FINISH(bl);
}
void dump(Formatter *f) const {