From 615689a9198aed76292c563fee1bf08e3515aaef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Oct 2011 13:03:09 -0700 Subject: [PATCH] osd: implement lost_revert Roll back to the last available version of an object. If there is no available version, delete it. Leave the door open for other approaches later. Currently this only works if the prior version is on the primary. If it is on another node, we don't pull it yet. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 24 +++++- src/osd/PG.h | 2 +- src/osd/ReplicatedPG.cc | 168 ++++++++++++++++++++++++++++++++++------ src/osd/ReplicatedPG.h | 5 +- 4 files changed, 169 insertions(+), 30 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 15d97ef34f587..67f043ba59af0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2357,14 +2357,29 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist flush_pg_stats(); } - else if (cmd.size() == 2 && cmd[0] == "mark_unfound_lost") { + else if (cmd.size() == 3 && cmd[0] == "mark_unfound_lost") { pg_t pgid; if (!pgid.parse(cmd[1].c_str())) { ss << "can't parse pgid '" << cmd[1] << "'"; r = -EINVAL; goto out; } - PG *pg = _lookup_lock_pg(pgid); + int mode; + if (cmd[2] == "revert") + mode = PG::Log::Entry::LOST_REVERT; + /* + else if (cmd[2] == "mark") + mode = PG::Log::Entry::LOST_MARK; + else if (cmd[2] == "delete" || cmd[2] == "remove") + mode = PG::Log::Entry::LOST_DELETE; + */ + else { + //ss << "mode must be mark|revert|delete"; + ss << "mode must be revert (mark|delete not yet implemented)"; + r = -EINVAL; + goto out; + } + PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid) : NULL; if (!pg) { ss << "pg " << pgid << " not found"; r = -ENOENT; @@ -2372,16 +2387,19 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist } if (!pg->is_primary()) { ss << "pg " << pgid << " not primary"; + pg->unlock(); r = -EINVAL; goto out; } int unfound = pg->missing.num_missing() - pg->missing_loc.size(); if (!unfound) { ss << "pg " << pgid << " has no unfound objects"; + pg->unlock(); r = -ENOENT; goto out; } if (!pg->all_unfound_are_queried_or_lost(pg->osd->osdmap)) { + pg->unlock(); ss << "pg " << pgid << " has " << unfound << " objects but we haven't probed all sources, not marking lost despite command " << cmd; @@ -2390,7 +2408,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist } ss << pgid << " has " << unfound << " objects unfound and apparently lost, marking"; - pg->mark_all_unfound_lost(); + pg->mark_all_unfound_lost(mode); pg->unlock(); } diff --git a/src/osd/PG.h b/src/osd/PG.h index 0a06cf7faf3ea..830b8a5f9af7a 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1405,7 +1405,7 @@ public: bool adjust_need_up_thru(const OSDMap *osdmap); bool all_unfound_are_queried_or_lost(const OSDMap* osdmap) const; - virtual void mark_all_unfound_lost() = 0; + virtual void mark_all_unfound_lost(int how) = 0; bool calc_min_last_complete_ondisk() { eversion_t min = last_complete_ondisk; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 124a91fe5cc0d..e165d86ab3569 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4445,12 +4445,32 @@ void ReplicatedPG::_failed_push(MOSDSubOp *op) +eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid) +{ + eversion_t v; + + assert(missing.is_missing(oid)); + v = missing.missing[oid].have; + dout(10) << "pick_newest_available " << oid << " i have " << v << dendl; + + for (unsigned i=1; i v) + v = h; + } + + dout(10) << "pick_newest_available " << oid << " " << v << dendl; + return v; +} + /* Mark an object as lost */ ReplicatedPG::ObjectContext *ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t, - const hobject_t &oid, eversion_t version, - utime_t mtime) + const hobject_t &oid, eversion_t version, + utime_t mtime, int what) { // Wake anyone waiting for this object. Now that it's been marked as lost, // we will just return an error code. @@ -4461,9 +4481,8 @@ ReplicatedPG::ObjectContext *ReplicatedPG::mark_object_lost(ObjectStore::Transac } // Add log entry - info.last_update.version++; - Log::Entry e(Log::Entry::LOST_MARK, oid, info.last_update, - version, osd_reqid_t(), mtime); + ++info.last_update.version; + Log::Entry e(what, oid, info.last_update, version, osd_reqid_t(), mtime); log.add(e); object_locator_t oloc; @@ -4497,9 +4516,9 @@ struct C_PG_MarkUnfoundLost : public Context { /* Mark all unfound objects as lost. */ -void ReplicatedPG::mark_all_unfound_lost() +void ReplicatedPG::mark_all_unfound_lost(int what) { - dout(3) << __func__ << dendl; + dout(3) << __func__ << " " << Log::Entry::get_op_name(what) << dendl; dout(30) << __func__ << ": log before:\n"; log.print(*_dout); @@ -4514,18 +4533,63 @@ void ReplicatedPG::mark_all_unfound_lost() map::iterator m = missing.missing.begin(); map::iterator mend = missing.missing.end(); while (m != mend) { - const hobject_t &soid(m->first); - if (missing_loc.find(soid) != missing_loc.end()) { + const hobject_t &oid(m->first); + if (missing_loc.find(oid) != missing_loc.end()) { // We only care about unfound objects ++m; continue; } - ObjectContext *obc = mark_object_lost(t, soid, m->second.need, mtime); - c->obcs.push_back(obc); + ObjectContext *obc = NULL; + eversion_t prev; + + switch (what) { + case Log::Entry::LOST_MARK: + obc = mark_object_lost(t, oid, m->second.need, mtime, Log::Entry::LOST_MARK); + missing.got(m++); + assert(0 == "actually, not implemented yet!"); + // we need to be careful about how this is handled on the replica! + break; + + case Log::Entry::LOST_REVERT: + prev = pick_newest_available(oid); + if (prev > eversion_t()) { + // log it + ++info.last_update.version; + Log::Entry e(Log::Entry::LOST_REVERT, oid, info.last_update, prev, osd_reqid_t(), mtime); + log.add(e); + dout(10) << e << dendl; + + // we are now missing the new version; recovery code will sort it out. + m++; + missing.revise_need(oid, info.last_update); + break; + } + /** fall-thru **/ + + case Log::Entry::LOST_DELETE: + { + // log it + ++info.last_update.version; + Log::Entry e(Log::Entry::LOST_DELETE, oid, info.last_update, m->second.need, + osd_reqid_t(), mtime); + log.add(e); + dout(10) << e << dendl; + + // delete local copy? NOT YET! FIXME + if (m->second.have != eversion_t()) { + assert(0 == "not implemented.. tho i'm not sure how useful it really would be."); + } + missing.rm(m++); + } + break; - // Remove from (my) missing set - missing.got(m++); + default: + assert(0); + } + + if (obc) + c->obcs.push_back(obc); } dout(30) << __func__ << ": log after:\n"; @@ -4550,7 +4614,7 @@ void ReplicatedPG::mark_all_unfound_lost() void ReplicatedPG::_finish_mark_all_unfound_lost(list& obcs) { - dout(10) << "_finish_mark_all_unfound_lost" << dendl; + dout(10) << "_finish_mark_all_unfound_lost " << dendl; lock(); while (!obcs.empty()) { ObjectContext *obc = obcs.front(); @@ -4716,6 +4780,11 @@ int ReplicatedPG::start_recovery_ops(int max) // We still have missing objects that we should grab from replicas. started += recover_primary(max); } + if (!started && num_unfound != get_num_unfound()) { + // second chance to recovery replicas + started = recover_replicas(max); + } + dout(10) << " started " << started << dendl; osd->logger->inc(l_osd_rop, started); @@ -4776,6 +4845,8 @@ int ReplicatedPG::recover_primary(int max) hobject_t head = soid; head.snap = CEPH_NOSNAP; + eversion_t need = item.need; + bool unfound = (missing_loc.find(soid) == missing_loc.end()); dout(10) << "recover_primary " @@ -4786,15 +4857,12 @@ int ReplicatedPG::recover_primary(int max) << (pulling.count(soid) ? " (pulling)":"") << (pulling.count(head) ? " (pulling head)":"") << dendl; - - if (!pulling.count(soid)) { - if (pulling.count(head)) { - ++skipped; - } else if (unfound) { - ++skipped; - } else { - // is this a clone operation that we can do locally? - if (latest && latest->op == Log::Entry::CLONE) { + + if (latest) { + switch (latest->op) { + case Log::Entry::CLONE: + { + // is this a clone operation that we can do locally? if (missing.is_missing(head) && missing.have_old(head) == latest->prior_version) { dout(10) << "recover_primary cloning " << head << " v" << latest->prior_version @@ -4825,7 +4893,59 @@ int ReplicatedPG::recover_primary(int max) continue; } } - + break; + + case Log::Entry::LOST_REVERT: + { + if (item.have == latest->prior_version) { + // I have it locally. Revert. + object_locator_t oloc; + oloc.pool = info.pgid.pool(); + oloc.key = soid.get_key(); + ObjectContext *obc = get_object_context(soid, oloc, true); + + if (obc->obs.oi.version == latest->version) { + // I'm already reverting + dout(10) << " already reverting " << soid << dendl; + } else { + dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl; + obc->ondisk_write_lock(); + obc->obs.oi.version = latest->version; + + ObjectStore::Transaction *t = new ObjectStore::Transaction; + bufferlist b2; + obc->obs.oi.encode(b2); + t->setattr(coll, soid, OI_ATTR, b2); + + recover_primary_got(soid, latest->version); + + osd->store->queue_transaction(&osr, t, + new C_OSD_AppliedPushedObject(this, t, obc), + new C_OSD_CommittedPushedObject(this, NULL, + info.history.same_interval_since, + info.last_complete), + new C_OSD_OndiskWriteUnlock(obc)); + continue; + } + } else { + need = latest->prior_version; + dout(10) << " pulling prior_version " << need << " for revert " << item << dendl; + + // ... + assert(0 == "pulling prior version for revert not implement yet"); + + } + } + break; + } + } + + if (!pulling.count(soid)) { + if (pulling.count(head)) { + ++skipped; + } else if (unfound) { + ++skipped; + } else { int r = pull(soid, need); switch (r) { case PULL_YES: diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index ac5268c994a23..5864255f66ea2 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -816,10 +816,11 @@ public: bool is_degraded_object(const hobject_t& oid); void wait_for_degraded_object(const hobject_t& oid, Message *op); - void mark_all_unfound_lost(); + void mark_all_unfound_lost(int what); + eversion_t pick_newest_available(const hobject_t& oid); ObjectContext *mark_object_lost(ObjectStore::Transaction *t, const hobject_t& oid, eversion_t version, - utime_t mtime); + utime_t mtime, int what); void _finish_mark_all_unfound_lost(list& obcs); void on_osd_failure(int o); -- 2.47.3