From 193e2ea532e9b18291ef2cf56b6423ce2d5487c2 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 5 Nov 2012 15:40:43 -0800 Subject: [PATCH] PG: persist divergent_priors in ondisklog Consider the following logs: a) 10'10(5'7) foo 12'11(4'3) bar b) 10'10(5'7) foo 13'11(4'4) baz When the osd with a merges primary log b, bar is deleted and added to the missing set with need=4'3 and have=0'0. If the osd then dies after deleting bar, but before recovering bar, PG::read_state() on start up will fail to re-add bar to the missing set, and bar will be incorrect on that osd. Now, (4'3, bar) will be added to the divergent_priors mapping to be scanned during read_state along with the log. Signed-off-by: Samuel Just --- src/osd/PG.cc | 34 ++++++++++++++++++++++++++++++++++ src/osd/PG.h | 18 +++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 9513d252c2b97..6057178dcbfd5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -410,6 +410,8 @@ bool PG::merge_old_entry(ObjectStore::Transaction& t, pg_log_entry_t& oe) dout(20) << "merge_old_entry had " << oe << " updating missing to " << oe.prior_version << dendl; if (oe.prior_version > eversion_t()) { + ondisklog.add_divergent_prior(oe.prior_version, oe.soid); + dirty_log = true; missing.revise_need(oe.soid, oe.prior_version); } else if (missing.is_missing(oe.soid)) { missing.rm(oe.soid, missing.missing[oe.soid].need); @@ -2214,6 +2216,10 @@ void PG::trim(ObjectStore::Transaction& t, eversion_t trim_to) { // trim? 
if (trim_to > log.tail) { + /* If we are trimming, we must be complete up to trim_to, time + * to throw out any divergent_priors + */ + ondisklog.divergent_priors.clear(); // We shouldn't be trimming the log past last_complete assert(trim_to <= info.last_complete); @@ -2490,6 +2496,34 @@ void PG::read_log(ObjectStore *store) missing.add(i->soid, i->version, eversion_t()); } } + for (map<eversion_t, hobject_t>::reverse_iterator i = + ondisklog.divergent_priors.rbegin(); + i != ondisklog.divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (did.count(i->second)) continue; + did.insert(i->second); + bufferlist bv; + int r = osd->store->getattr(coll, i->second, OI_ATTR, bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. + */ + assert(oi.version == i->first); + } else { + dout(15) << "read_log missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t()); + } + } } dout(10) << "read_log done" << dendl; } diff --git a/src/osd/PG.h b/src/osd/PG.h index 13d529f4d6ebb..5e1d0c3954fdf 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -281,6 +281,19 @@ public: uint64_t zero_to; // first non-zeroed byte of log. bool has_checksums; + /** + * We reconstruct the missing set by comparing the recorded log against + * the objects in the pg collection. Unfortunately, it's possible to + * have an object in the missing set which is not in the log due to + * a divergent operation with a prior_version pointing before the + * pg log tail. 
To deal with this, we store alongside the log a mapping + * of divergent priors to be checked along with the log during read_state. + */ + map<eversion_t, hobject_t> divergent_priors; + void add_divergent_prior(eversion_t version, hobject_t obj) { + divergent_priors.insert(make_pair(version, obj)); + } + OndiskLog() : tail(0), head(0), zero_to(0), has_checksums(true) {} @@ -294,10 +307,11 @@ public: } void encode(bufferlist& bl) const { - ENCODE_START(4, 3, bl); + ENCODE_START(5, 3, bl); ::encode(tail, bl); ::encode(head, bl); ::encode(zero_to, bl); + ::encode(divergent_priors, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { @@ -309,6 +323,8 @@ public: ::decode(zero_to, bl); else zero_to = 0; + if (struct_v >= 5) + ::decode(divergent_priors, bl); DECODE_FINISH(bl); } void dump(Formatter *f) const { -- 2.39.5