From da9aed99c70c3a4f9be36654990d5192173077da Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 31 Oct 2012 15:38:35 -0700 Subject: [PATCH] osd/: add pg_log_entry_t::reverting_to for LOST_REVERT Previously, we encoded the version to which we were reverting in the prior_version field. Now, we explicitely encode that version in the reverting_to field. Using prior_version to encode the reverting_to version could cause us to revert to the wrong version: primary osd.1 writes foo 7'6(0'0) primary osd.1 writes foo 9'9(7'6) primary osd.0 learns the log up to 9'9 but recovers no objects primary osd.1 dies primary osd.0 reverts the foo in version 17'11(7'6) to 7'6 primary osd.1 comes back and starts to recover itself foo is not missing on osd.1, and so the new log entry 17'11(7'6) causes osd.1's missing set to contain an entry for foo with need=17'11 and have=7'6. recover_primary uses this information to conclude that we can locally recover the LOST_REVERT event from the local copy which it assumes is 7'6 but in fact is 9'9. This bug actually manifested as failing an assert in populate_obc_watchers since the version on disk didn't match the prior_version of the log event. Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 33 +++++++++++++++++++++++++-------- src/osd/osd_types.cc | 8 +++++++- src/osd/osd_types.h | 2 +- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 5763c43d98577..9ca4546f97432 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3970,7 +3970,8 @@ void ReplicatedPG::populate_obc_watchers(ObjectContext *obc) assert(is_active()); assert(!is_missing_object(obc->obs.oi.soid) || (log.objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary() - log.objects[obc->obs.oi.soid]->prior_version == obc->obs.oi.version)); + log.objects[obc->obs.oi.soid]->op == pg_log_entry_t::LOST_REVERT && + log.objects[obc->obs.oi.soid]->reverting_to == obc->obs.oi.version)); dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl; if (!obc->obs.oi.watchers.empty()) { @@ -4767,6 +4768,18 @@ int ReplicatedPG::pull(const hobject_t& soid, eversion_t v) << " but it is unfound" << dendl; return PULL_NONE; } + + assert(peer_missing.count(fromosd)); + if (peer_missing[fromosd].is_missing(soid, v)) { + assert(peer_missing[fromosd].missing[soid].have != v); + dout(10) << "pulling soid " << soid << " from osd " << fromosd + << " at version " << peer_missing[fromosd].missing[soid].have + << " rather than at version " << v << dendl; + v = peer_missing[fromosd].missing[soid].have; + assert(log.objects.count(soid) && + log.objects[soid]->op == pg_log_entry_t::LOST_REVERT && + log.objects[soid]->reverting_to == v); + } dout(7) << "pull " << soid << " v " << v @@ -5060,7 +5073,7 @@ void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info, assert(is_primary()); pg_log_entry_t *latest = log.objects[recovery_info.soid]; if (latest->op == pg_log_entry_t::LOST_REVERT && - latest->prior_version == recovery_info.version) { + latest->reverting_to == recovery_info.version) { dout(10) << " got old revert version " << recovery_info.version << " for " << *latest << dendl; recovery_info.version = latest->version; @@ -5837,7 +5850,10 @@ void ReplicatedPG::mark_all_unfound_lost(int what) if (prev > eversion_t()) { // log it ++info.last_update.version; - pg_log_entry_t e(pg_log_entry_t::LOST_REVERT, oid, info.last_update, prev, osd_reqid_t(), mtime); + pg_log_entry_t e( + pg_log_entry_t::LOST_REVERT, oid, info.last_update, + m->second.need, osd_reqid_t(), mtime); + e.reverting_to = prev; log.add(e); dout(10) << e << dendl; @@ -6271,7 +6287,7 @@ int ReplicatedPG::recover_primary(int max) case pg_log_entry_t::LOST_REVERT: { - if (item.have == latest->prior_version) { + if (item.have == latest->reverting_to) { // I have it locally. Revert. object_locator_t oloc; oloc.pool = info.pgid.pool(); @@ -6315,16 +6331,17 @@ int ReplicatedPG::recover_primary(int max) * - this way we don't need to mangle the missing code to be general about needing an old * version... */ - need = latest->prior_version; - dout(10) << " need to pull prior_version " << need << " for revert " << item << dendl; + eversion_t alternate_need = latest->reverting_to; + dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl; set& loc = missing_loc[soid]; for (map::iterator p = peer_missing.begin(); p != peer_missing.end(); ++p) - if (p->second.missing[soid].have == need) { + if (p->second.is_missing(soid, need) && + p->second.missing[soid].have == alternate_need) { missing_loc_sources.insert(p->first); loc.insert(p->first); } - dout(10) << " will pull " << need << " from one of " << loc << dendl; + dout(10) << " will pull " << alternate_need << " or " << need << " from one of " << loc << dendl; unfound = false; } } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index b37d8a174f92c..e8a0c7126f318 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1579,7 +1579,7 @@ void pg_query_t::generate_test_instances(list& o) void pg_log_entry_t::encode(bufferlist &bl) const { - ENCODE_START(5, 4, bl); + ENCODE_START(6, 4, bl); ::encode(op, bl); ::encode(soid, bl); ::encode(version, bl); @@ -1588,6 +1588,7 @@ void pg_log_entry_t::encode(bufferlist &bl) const ::encode(mtime, bl); if (op == CLONE) ::encode(snaps, bl); + ::encode(reverting_to, bl); ENCODE_FINISH(bl); } @@ -1614,6 +1615,11 @@ void pg_log_entry_t::decode(bufferlist::iterator &bl) ::decode(snaps, bl); if (struct_v < 5) invalid_pool = true; + if (struct_v >= 6) { + ::decode(reverting_to, bl); + } else { + reverting_to = prior_version; + } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e8962336776b8..b9781dd427f7e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1248,7 +1248,7 @@ struct pg_log_entry_t { __s32 op; hobject_t soid; - eversion_t version, prior_version; + eversion_t version, prior_version, reverting_to; osd_reqid_t reqid; // caller+tid to uniquely identify request utime_t mtime; // this is the _user_ mtime, mind you bufferlist snaps; // only for clone entries -- 2.39.5