class ElectionOwner {
public:
+ /**
+ * Write down the given epoch in persistent storage, such that it
+ * can later be retrieved by read_persisted_epoch() even across process
+ * or machine restarts.
+ *
+ * @param e The epoch to write
+ */
virtual void persist_epoch(epoch_t e) = 0;
+ /**
+ * Retrieve the most-recently-persisted epoch.
+ *
+ * @returns The latest epoch passed to persist_epoch()
+ */
virtual epoch_t read_persisted_epoch() = 0;
+ /**
+ * Validate that the persistent store is working by committing
+ * to it. (There is no interface for retrieving the value; this
+ * tests local functionality before doing things like triggering
+ * elections to try and join a quorum.)
+ */
virtual void validate_store() = 0;
+ /**
+ * Notify the ElectionOwner that ElectionLogic has increased its
+ * election epoch. This resets an election (either on local loss or victory,
+ * or when trying a new election round) and the ElectionOwner
+ * should reset any tracking of its own to match. (The ElectionLogic
+ * will further trigger sending election messages if that is
+ * appropriate.)
+ */
virtual void notify_bump_epoch() = 0;
+ /**
+ * Notify the ElectionOwner we must start a new election.
+ */
virtual void trigger_new_election() = 0;
+ /**
+ * Retrieve this Paxos instance's rank.
+ *
+ * @returns The rank of this Paxos instance
+ */
virtual int get_my_rank() = 0;
+ /**
+ * Send a PROPOSE message to all our peers. This happens when
+ * we have started a new election (which may mean attempting to
+ * override a current one).
+ *
+ * @param e The election epoch of our proposal.
+ */
virtual void propose_to_peers(epoch_t e) = 0;
+ /**
+ * The election has failed and we aren't sure what the state of the
+ * quorum is, so reset the entire system as if from scratch.
+ */
virtual void reset_election() = 0;
+ /**
+ * Ask the ElectionOwner if we-the-Monitor have ever participated in the
+ * quorum (including across process restarts!).
+ *
+ * @returns true if we have participated, false otherwise
+ */
virtual bool ever_participated() = 0;
+ /**
+ * Ask the ElectionOwner for the size of the Paxos set. This includes
+ * those monitors which may not be in the current quorum!
+ *
+ * @returns The number of monitors in the Paxos set, whether or not
+ * they are in the current quorum
+ */
virtual unsigned paxos_size() = 0;
+ /**
+ * Tell the ElectionOwner we have started a new election.
+ *
+ * The ElectionOwner is responsible for timing out the election (by invoking
+ * end_election_period()) if it takes too long (as defined by the ElectionOwner).
+ * This function is the opportunity to do that and to clean up any other external
+ * election state it may be maintaining.
+ */
virtual void _start() = 0;
+ /**
+ * Tell the ElectionOwner to defer to the identified peer. Tell that peer
+ * we have deferred to it.
+ *
+ * @param who The rank of the peer we are deferring to
+ * @post we sent an ack message to @p who
+ */
virtual void _defer_to(int who) = 0;
+ /**
+ * We have won an election, so have the ElectionOwner message that to
+ * our new quorum!
+ *
+ * @param quorum The ranks of our peers which deferred to us and
+ * must be told of our victory
+ */
virtual void message_victory(const set<int>& quorum) = 0;
- virtual bool is_current_member(int ) = 0;
+ /**
+ * Query the ElectionOwner about whether a given rank is in the
+ * currently active quorum.
+ *
+ * @param rank the Paxos rank whose status we are checking
+ * @returns true if the rank is in our current quorum, false otherwise.
+ */
+ virtual bool is_current_member(int rank) = 0;
virtual ~ElectionOwner() {}
};
+/**
+ * This class maintains local state for running an election
+ * between Paxos instances. It receives input requests
+ * and calls back out to its ElectionOwner to do persistence
+ * and message other entities.
+ */
+
class ElectionLogic {
ElectionOwner *elector;
CephContext *cct;
+ /**
+ * Latest epoch we've seen.
+ *
+ * @remarks if its value is odd, we're electing; if it's even, then we're
+ * stable.
+ */
epoch_t epoch = 0;
+ /**
+ * Indicates who we have acked (see defer()). Initialized to -1 by the
+ * constructor, presumably meaning we have not acked anyone yet --
+ * TODO confirm against the implementation.
+ */
int leader_acked;
public:
+ /**
+ * Indicates if we are participating in the quorum.
+ *
+ * @remarks By default, we are created as participating. We may stop
+ * participating if something explicitly sets our value
+ * false, though. If that happens, it will
+ * have to set participating=true and invoke start() for us to resume
+ * participating in the quorum.
+ */
bool participating;
+ /**
+ * Indicates if we are the ones being elected.
+ *
+ * We always attempt to be the one being elected if we are the ones starting
+ * the election. If we are not the ones that started it, we will only attempt
+ * to be elected if we think we might have a chance (i.e., the other guy's
+ * rank is lower than ours).
+ */
bool electing_me;
+ /**
+ * Set containing all those that acked our proposal to become the Leader.
+ *
+ * If we are acked by ElectionOwner::paxos_size() peers, we will declare
+ * victory.
+ */
set<int> acked_me;
ElectionLogic(ElectionOwner *e, CephContext *c) : elector(e), cct(c),
leader_acked(-1),
participating(true),
electing_me(false) {}
+ /**
+ * If there are no other peers in this Paxos group, ElectionOwner
+ * can simply declare victory and we will make it so.
+ *
+ * @pre paxos_size() is 1
+ * @pre get_my_rank() is 0
+ */
void declare_standalone_victory();
+ /**
+ * Start a new election by proposing ourselves as the new Leader.
+ *
+ * Basically, send propose messages to all the peers.
+ *
+ * @pre participating is true
+ * @post epoch is an odd value
+ * @post electing_me is true
+ * @post We have invoked propose_to_peers() on our ElectionOwner
+ * @post We have invoked _start() on our ElectionOwner
+ */
void start();
+ /**
+ * ElectionOwner has decided the election has taken too long and expired.
+ *
+ * This will happen when no one declared victory or started a new election
+ * during the allowed time span.
+ *
+ * When the election expires, we will check if we were the ones who won, and
+ * if so we will declare victory. If that is not the case, then we assume
+ * that the one we deferred to didn't declare victory quickly enough (in fact,
+ * as far as we know, it may even be dead); so, just propose ourselves as the
+ * Leader.
+ */
void end_election_period();
+ /**
+ * Handle a proposal from some other node asking to become
+ * the Leader.
+ *
+ * If the message appears to be old (i.e., its epoch is lower than our epoch),
+ * then we may take one of two actions:
+ *
+ * @li Ignore it because it's nothing more than an old proposal
+ * @li Start new elections if we verify that it was sent by a monitor from
+ * outside the quorum; given its old state, it's fair to assume it just
+ * started, so we should start new elections so it may rejoin
+ *
+ * If we did not ignore the received message, then we know that this message
+ * was sent by some other node proposing itself to become the Leader. So, we
+ * will take one of the following actions:
+ *
+ * @li Ignore it because we already acked another node with higher rank
+ * @li Ignore it and start a new election because we outrank it
+ * @li Defer to it because it outranks us and the node we previously
+ * acked, if any
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * (NOTE(review): the description above also covers proposals from an
+ * older epoch, which appears to conflict with this precondition --
+ * confirm which is intended.)
+ * @param mepoch The epoch of the proposal
+ * @param from The rank proposing itself as leader
+ */
void receive_propose(epoch_t mepoch, int from);
+ /**
+ * Handle a message from some other participant Acking us as the Leader.
+ *
+ * When we receive such a message, one of three things may be happening:
+ * @li We received a message with a newer epoch, which means we must have
+ * somehow lost track of what was going on (maybe we rebooted), thus we
+ * will start a new election
+ * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
+ * is true), and we are actually being Acked by someone; thus simply add
+ * the one acking us to the @p acked_me set. If we now have acks from
+ * all the participants, then we can declare victory
+ * @li We already deferred the election to somebody else, so we will just
+ * ignore this message
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * @post Election is on-going if we deferred to somebody else
+ * @post Election is on-going if we are still waiting for further Acks
+ * @post Election is not on-going if we are victorious
+ * @post Election is not on-going if we must start a new one
+ *
+ * @param from The rank which acked us
+ * @param from_epoch The election epoch the ack belongs to
+ */
void receive_ack(int from, epoch_t from_epoch);
+ /**
+ * Handle a message from some other participant declaring Victory.
+ *
+ * We just got a message from someone declaring themselves Victorious, thus
+ * the new Leader.
+ *
+ * However, if the message's epoch happens to be different from our epoch+1,
+ * then it means we lost track of something and we must start a new election.
+ *
+ * If that is not the case, then we will simply update our epoch to the one
+ * in the message and invoke start() to reset the quorum.
+ *
+ * @pre from_epoch is the current or a newer epoch
+ * @post Election is not on-going
+ * @post Updated @p epoch
+ * @post We are a peon in a new quorum if we lost the election
+ *
+ * @param from The victory-claiming rank
+ * @param from_epoch The election epoch in which they claim victory
+ * @returns NOTE(review): the meaning of the result is not stated here;
+ * presumably true if the claim is accepted and false if it is
+ * rejected -- confirm against the implementation
+ */
bool receive_victory_claim(int from, epoch_t from_epoch);
+ /**
+ * Obtain our epoch
+ *
+ * @returns Our current epoch number
+ */
epoch_t get_epoch() { return epoch; }
private:
+ /**
+ * Initialize the ElectionLogic class.
+ *
+ * Basically, we will simply read whatever epoch value we have in our stable
+ * storage, or consider it to be 1 if none is read.
+ *
+ * @post @p epoch is set to 1 or higher.
+ */
void init();
+ /**
+ * Update our epoch.
+ *
+ * If we come across a higher epoch, we simply update ours, also making
+ * sure we are no longer being elected (even though we could have been,
+ * we no longer are since we no longer are on that old epoch).
+ *
+ * @pre Our epoch is not larger than @p e
+ * @post Our epoch equals @p e
+ *
+ * @param e Epoch to which we will update our epoch
+ */
void bump_epoch(epoch_t e);
+ /**
+ * Defer the current election to some other monitor.
+ *
+ * This means that we will ack some other monitor and drop out from the run
+ * to become the Leader. We will only defer an election if the monitor we
+ * are deferring to outranks us.
+ *
+ * @pre @p who outranks us (i.e., who < our rank)
+ * @pre @p who outranks any other monitor we have deferred to in the past
+ * @post electing_me is false
+ * @post leader_acked equals @p who
+ * @post we triggered ElectionOwner's _defer_to() on @p who
+ *
+ * @param who Some other monitor's numeric identifier.
+ */
void defer(int who);
+ /**
+ * Declare Victory.
+ *
+ * We won. Or at least we believe we won, but for all intents and purposes
+ * that does not matter. What matters is that we Won.
+ *
+ * That said, we must now bump our epoch to reflect that the election is over
+ * and then we must let everybody in the quorum know we are their brand new
+ * Leader.
+ *
+ * Actually, the quorum will be now defined as the group of monitors that
+ * acked us during the election process.
+ *
+ * @pre Election is on-going
+ * @pre electing_me is true
+ * @post electing_me is false
+ * @post epoch is bumped up into an even value
+ * @post Election is not on-going
+ * @post We have a quorum, composed of the monitors that acked us
+ * @post We invoked message_victory() on the ElectionOwner
+ */
void declare_victory();
};
class Monitor;
/**
- * This class is responsible for maintaining the local state when electing
+ * This class is responsible for handling messages and maintaining
+ * an ElectionLogic which holds the local state when electing
* a new Leader. We may win or we may lose. If we win, it means we became the
* Leader; if we lose, it means we are a Peon.
*/
*/
void cancel_timer();
- /**
- * Latest epoch we've seen.
- *
- * @remarks if its value is odd, we're electing; if it's even, then we're
- * stable.
- */
- //epoch_t epoch;
-
- /**
- * Indicates if we are participating in the quorum.
- *
- * @remarks By default, we are created as participating. We may stop
- * participating if the Monitor explicitly calls
- * Elector::stop_participating though. If that happens, it will
- * have to call Elector::start_participating for us to resume
- * participating in the quorum.
- */
- // bool participating;
-
// electing me
/**
* @defgroup Elector_h_electing_me_vars We are being elected
* @{
*/
/**
- * Indicates if we are the ones being elected.
- *
- * We always attempt to be the one being elected if we are the ones starting
- * the election. If we are not the ones that started it, we will only attempt
- * to be elected if we think we might have a chance (i.e., the other guy's
- * rank is lower than ours).
- */
- // bool electing_me;
- /**
- * Set containing all those that acked our proposal to become the Leader.
- *
- * If we are acked by everyone in the MonMap, we will declare
- * victory. Also note each peer's feature set.
+ * Map containing info of all those that acked our proposal to become the Leader.
+ * Note each peer's info.
*/
map<int, elector_info_t> peer_info;
- /**
- * @}
- */
- /**
- * @defgroup Elector_h_electing_them_vars We are electing another guy
- * @{
- */
- /**
- * Indicates who we have acked
- */
- // int leader_acked;
/**
* @}
*/
* Handle a message from some other node proposing itself to become
* the Leader.
*
- * If the message appears to be old (i.e., its epoch is lower than our epoch),
- * then we may take one of two actions:
- *
- * @li Ignore it because it's nothing more than an old proposal
- * @li Start new elections if we verify that it was sent by a monitor from
- * outside the quorum; given its old state, it's fair to assume it just
- * started, so we should start new elections so it may rejoin
- *
- * If we did not ignore the received message, then we know that this message
- * was sent by some other node proposing itself to become the Leader. So, we
- * will take one of the following actions:
- *
- * @li Ignore it because we already acked another node with higher rank
- * @li Ignore it and start a new election because we outrank it
- * @li Defer to it because it outranks us and the node we previously
- * acked, if any
- *
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, then pass the request to our ElectionLogic.
*
* @invariant The received message is an operation of type OP_PROPOSE
*
+ * @pre Message epoch is from the current or a newer epoch
+ *
* @param m A message sent by another participant in the quorum.
*/
void handle_propose(MonOpRequestRef op);
/**
* Handle a message from some other participant Acking us as the Leader.
*
- * When we receive such a message, one of three thing may be happening:
- * @li We received a message with a newer epoch, which means we must have
- * somehow lost track of what was going on (maybe we rebooted), thus we
- * will start a new election
- * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
- * is true), and we are actually being Acked by someone; thus simply add
- * the one acking us to the @p acked_me set. If we do now have acks from
- * all the participants, then we can declare victory
- * @li We already deferred the election to somebody else, so we will just
- * ignore this message
- *
- * @pre Election is on-going
- * @post Election is on-going if we deferred to somebody else
- * @post Election is on-going if we are still waiting for further Acks
- * @post Election is not on-going if we are victorious
- * @post Election is not on-going if we must start a new one
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, add it to peer_info, and pass the ack to our
+ * ElectionLogic.
+ *
+ * @pre Message epoch is from the current or a newer epoch
*
* @param m A message with an operation type of OP_ACK
*/
* We just got a message from someone declaring themselves Victorious, thus
* the new Leader.
*
- * However, if the message's epoch happens to be different from our epoch+1,
- * then it means we lost track of something and we must start a new election.
- *
- * If that is not the case, then we will simply update our epoch to the one
- * in the message, cancel our @p expire_event timer and inform our Monitor
- * that we lost the election and provide it with the new quorum.
+ * We pass the Victory to our ElectionLogic, and if it confirms the
+ * victory we lose the election and start following this Leader. Otherwise,
+ * drop the message.
*
- * @pre Election in on-going
+ * @pre Message epoch is from the current or a newer epoch
* @post Election is not on-going
* @post Updated @p epoch
* @post We have a new quorum if we lost the election
public:
/**
- * Update our epoch.
- *
- * If we come across a higher epoch, we simply update ours, also making
- * sure we are no longer being elected (even though we could have been,
- * we no longer are since we no longer are on that old epoch).
- *
- * @pre Our epoch is lower than @p e
- * @post Our epoch equals @p e
- *
- * @param e Epoch to which we will update our epoch
+ * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
+ * @{
*/
+ /* Commit the given epoch to our MonStore */
+ void persist_epoch(epoch_t e);
+ /* Read the epoch out of our MonStore */
+ epoch_t read_persisted_epoch();
+ /* Write a nonsense key "election_writeable_test" to our MonStore */
+ void validate_store();
+ /* Reset my tracking. Currently, just call Monitor::join_election() */
void notify_bump_epoch();
-
+ /* Call a new election: Invoke Monitor::start_election() */
+ void trigger_new_election();
+ /* Retrieve rank from the Monitor */
+ int get_my_rank();
+ /* Send MMonElection OP_PROPOSE to every monitor in the map. */
+ void propose_to_peers(epoch_t e);
+ /* bootstrap() the Monitor */
+ void reset_election();
+ /* Retrieve the Monitor::has_ever_joined member */
+ bool ever_participated();
+ /* Retrieve monmap->size() */
+ unsigned paxos_size();
/**
- * Start new elections by proposing ourselves as the new Leader.
+ * Reset the expire_event timer so we can limit the amount of time we
+ * will be electing. Clean up our peer_info.
*
- * Basically, send propose messages to all the monitors in the MonMap and
- * then reset the expire_event timer so we can limit the amount of time we
- * will be going at it.
- *
- * @pre participating is true
- * @post epoch is an odd value
- * @post electing_me is true
- * @post we sent propose messages to all the monitors in the MonMap
* @post we reset the expire_event timer
*/
void _start();
/**
- * Defer the current election to some other monitor.
- *
- * This means that we will ack some other monitor and drop out from the run
- * to become the Leader. We will only defer an election if the monitor we
- * are deferring to outranks us.
+ * Send an MMonElection message deferring to the identified monitor. We
+ * also increase the election timeout so the monitor we defer to
+ * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
*
- * @pre @p who outranks us (i.e., who < our rank)
- * @pre @p who outranks any other monitor we have deferred to in the past
- * @post electing_me is false
- * @post leader_acked equals @p who
* @post we sent an ack message to @p who
* @post we reset the expire_event timer
*
*/
void _defer_to(int who);
/**
- * The election has taken too long and has expired.
- *
- * This will happen when no one declared victory or started a new election
- * during the time span allowed by the expire_event timer.
- *
- * When the election expires, we will check if we were the ones who won, and
- * if so we will declare victory. If that is not the case, then we assume
- * that the one we deferred to didn't declare victory quickly enough (in fact,
- * as far as we know, we may even be dead); so, just propose ourselves as the
- * Leader.
- */
- // void expire();
- /**
- * Declare Victory.
- *
- * We won. Or at least we believe we won, but for all intentions and purposes
- * that does not matter. What matters is that we Won.
- *
- * That said, we must now bump our epoch to reflect that the election is over
- * and then we must let everybody in the quorum know we are their brand new
- * Leader. And we will also cancel our expire_event timer.
- *
- * Actually, the quorum will be now defined as the group of monitors that
- * acked us during the election process.
- *
- * @pre Election is on-going
- * @pre electing_me is true
- * @post electing_me is false
- * @post epoch is bumped up into an even value
- * @post Election is not on-going
- * @post We have a quorum, composed of the monitors that acked us
- * @post We sent a message of type OP_VICTORY to each quorum member.
+ * Our ElectionLogic told us we won an election! Identify the quorum
+ * features, tell our new peons we've won, and invoke Monitor::win_election().
*/
void message_victory(const set<int>& quorum);
-
- void persist_epoch(epoch_t e);
- epoch_t read_persisted_epoch();
- void validate_store();
- void trigger_new_election();
- int get_my_rank();
- void propose_to_peers(epoch_t e);
- void reset_election();
- bool ever_participated();
- unsigned paxos_size();
+ /* Check if rank is in mon->quorum */
+ bool is_current_member(int rank);
+ /**
+ * @}
+ */
Elector *elector;
- bool is_current_member(int rank);
/**
* Create an Elector class
explicit Elector(Monitor *m);
virtual ~Elector() {}
- /**
- * Initiate the Elector class.
- *
- * Basically, we will simply read whatever epoch value we have in our stable
- * storage, or consider it to be 1 if none is read.
- *
- * @post @p epoch is set to 1 or higher.
- */
- // void init();
/**
* Inform this class it is supposed to shutdown.
*
void shutdown();
/**
- * Obtain our epoch
+ * Obtain our epoch from ElectionLogic.
*
* @returns Our current epoch number
*/
epoch_t get_epoch() { return logic.get_epoch(); }
+ /**
+ * If the Monitor knows there are no Paxos peers (so
+ * we are rank 0 and there are no others) we can declare victory.
+ */
void declare_standalone_victory() {
logic.declare_standalone_victory();
}
/**
* Call an election.
*
- * This function simply calls Elector::start.
+ * This function simply calls ElectionLogic::start.
*/
void call_election() {
logic.start();