From 11f5e48c57b17c88d552c9de28b6616562c81987 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 8 Aug 2008 13:34:39 -0700 Subject: [PATCH] osd: guard rep_ops based on map epoch. fix DOWN state. prefer prior_osds for newest_update_osd --- src/osd/OSD.cc | 1 + src/osd/PG.cc | 10 ++++-- src/osd/PG.h | 10 +++--- src/osd/ReplicatedPG.cc | 75 ++++++++++++++++++++++++----------------- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 47b3584307d74..0878c1dde7567 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1704,6 +1704,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) // deactivate. pg->state_clear(PG_STATE_ACTIVE); + pg->state_clear(PG_STATE_DOWN); // reset primary state? if (oldrole == 0 || pg->get_role() == 0) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 5decfcb44f0ea..2ef8ac3d8db96 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -676,7 +676,8 @@ void PG::build_prior() } } - if (!any_up) { + if (info.history.last_epoch_started < info.history.same_since && + !any_up) { dout(10) << "build_prior no osds are up from the last epoch started, PG is down for now." << dendl; state_set(PG_STATE_DOWN); } @@ -763,7 +764,9 @@ void PG::peer(ObjectStore::Transaction& t, for (map::iterator it = peer_info.begin(); it != peer_info.end(); it++) { - if (it->second.last_update > newest_update) { + if (it->second.last_update > newest_update || + (it->second.last_update == newest_update && // prefer osds in the prior set + prior_set.count(newest_update_osd) == 0)) { newest_update = it->second.last_update; newest_update_osd = it->first; } @@ -774,6 +777,8 @@ void PG::peer(ObjectStore::Transaction& t, peers_complete_thru = it->second.last_complete; } } + if (newest_update == info.last_update) // or just me, if nobody better. + newest_update_osd = osd->whoami; // gather log(+missing) from that person! if (newest_update_osd != osd->whoami) { @@ -943,6 +948,7 @@ void PG::activate(ObjectStore::Transaction& t, // twiddle pg state state_set(PG_STATE_ACTIVE); state_clear(PG_STATE_STRAY); + state_clear(PG_STATE_DOWN); if (is_crashed()) { //assert(is_replay()); // HELP.. not on replica? state_clear(PG_STATE_CRASHED); diff --git a/src/osd/PG.h b/src/osd/PG.h index c8d1fda99d5e5..37b7097f15e92 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -645,6 +645,7 @@ public: void set_role(int r) { role = r; } bool is_primary() const { return role == PG_ROLE_HEAD; } + bool is_replica() const { return role > 0; } bool is_acker() const { if (g_conf.osd_rep == OSD_REP_PRIMARY) return is_primary(); @@ -662,6 +663,7 @@ public: bool is_complete() const { return info.last_complete == info.last_update; } + int get_state() const { return state; } bool is_active() const { return state_test(PG_STATE_ACTIVE); } bool is_crashed() const { return state_test(PG_STATE_CRASHED); } bool is_down() const { return state_test(PG_STATE_DOWN); } @@ -794,11 +796,9 @@ inline ostream& operator<<(ostream& out, const PG& pg) out << " pct " << pg.peers_complete_thru; if (!pg.have_master_log) out << " !hml"; } - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; + + out << " " << pg_state_string(pg.get_state()); + //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index d54927ec618ec..69e636fa7b156 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1285,6 +1285,20 @@ void ReplicatedPG::sub_op_modify(MOSDSubOp *op) << " v " << nv << " " << op->get_offset() << "~" << op->get_length() << dendl; + + // sanity checks + if (op->get_map_epoch() < info.history.same_primary_since) { + dout(10) << "sub_op_modify discarding old sub_op from " + << op->get_map_epoch() << " < " << info.history.same_primary_since << dendl; + delete op; + return; + } + if (!is_active()) { + dout(10) << "sub_op_modify not active" << dendl; + delete op; + return; + } + assert(is_replica()); // note peer's stat int fromosd = op->get_source().num(); @@ -1478,37 +1492,20 @@ void ReplicatedPG::sub_op_pull(MOSDSubOp *op) { const pobject_t poid = op->get_poid(); const eversion_t v = op->get_version(); - int from = op->get_source().num(); dout(7) << "op_pull " << poid << " v " << op->get_version() << " from " << op->get_source() << dendl; - // is a replica asking? are they missing it? - if (is_primary()) { - // primary - assert(peer_missing.count(from)); // we had better know this, from the peering process. + if (op->get_map_epoch() < info.history.same_primary_since) { + dout(10) << "sub_op_pull discarding old sub_op from " + << op->get_map_epoch() << " < " << info.history.same_primary_since << dendl; + delete op; + return; + } - if (!peer_missing[from].is_missing(poid.oid)) { - dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } + assert(!is_primary()); // we should be a replica or stray. - // do we have it yet? - if (is_missing_object(poid.oid)) { - wait_for_missing_object(poid.oid, op); - return; - } - } else { - // non-primary - if (missing.is_missing(poid.oid)) { - dout(7) << "op_pull not primary, and missing " << poid << ", ignoring" << dendl; - delete op; - return; - } - } - // push it back! push(poid, op->get_source().num()); } @@ -1522,6 +1519,30 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) pobject_t poid = op->get_poid(); eversion_t v = op->get_version(); + dout(7) << "op_push " + << poid + << " v " << v + << " size " << op->get_length() << " " << op->get_data().length() + << dendl; + + if (is_replica()) { + // replica should only accept pushes from the current primary. + if (op->get_map_epoch() < info.history.same_primary_since) { + dout(10) << "sub_op_push discarding old sub_op from " + << op->get_map_epoch() << " < " << info.history.same_primary_since << dendl; + delete op; + return; + } + // FIXME: actually, no, what i really want here is a personal "same_role_since" + if (!is_active()) { + dout(10) << "sub_op_push not active" << dendl; + delete op; + return; + } + } else { + // primary will accept pushes anytime. + } + // are we missing (this specific version)? // (if version is wrong, it is either old (we don't want it) or // newer (peering is buggy)) @@ -1531,12 +1552,6 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) return; } - dout(7) << "op_push " - << poid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - assert(op->get_data().length() == op->get_length()); // write object and add it to the PG -- 2.39.5