\ |
\ v
\-- STOPPING
-
+ new states:
+
+ boot --> standby, creating, or starting.
+
+
+ dne ----> creating -----> active*
+ ^ ^___________/ / ^ ^
+ | / / |
+ destroying / / |
+ ^ / / |
+ | / / |
+ stopped <---- stopping* <-/ / |
+ \ / |
+ ----- starting* ----/ |
+ |
+ failed |
+ \ |
+ \--> replay* --> reconnect* --> rejoin*
+
+ * = can fail
*/
class MDSMap {
public:
// mds states
- static const int STATE_DNE = 0; // down, never existed.
- static const int STATE_STOPPED = -1; // down, once existed, but no subtrees. empty log.
- static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered.
-
- static const int STATE_BOOT = -3; // up, boot announcement. destiny unknown.
- static const int STATE_STANDBY = -4; // up, idle. waiting for assignment by monitor.
- static const int STATE_CREATING = -5; // up, creating MDS instance (new journal, idalloc..).
- static const int STATE_STARTING = -6; // up, starting prior stopped MDS instance.
-
- static const int STATE_REPLAY = 7; // up, starting prior failed instance. scanning journal.
- static const int STATE_RESOLVE = 8; // up, disambiguating distributed operations (import, rename, etc.)
- static const int STATE_RECONNECT = 9; // up, reconnect to clients
- static const int STATE_REJOIN = 10; // up, replayed journal, rejoining distributed cache
- static const int STATE_ACTIVE = 11; // up, active
- static const int STATE_STOPPING = 12; // up, exporting metadata (-> standby or out)
+ static const int STATE_DNE = 0; // down, never existed.
+ static const int STATE_DESTROYING = -1; // down, existing, semi-destroyed.
+ static const int STATE_STOPPED = -2; // down, once existed, but no subtrees. empty log.
+ static const int STATE_FAILED = 3; // down, active subtrees; needs to be recovered.
+
+ static const int STATE_BOOT = -4; // up, boot announcement. destiny unknown.
+ static const int STATE_STANDBY = -5; // up, idle. waiting for assignment by monitor.
+
+ static const int STATE_CREATING = -6; // up, creating MDS instance (new journal, idalloc..).
+ static const int STATE_STARTING = -7; // up, starting prior stopped MDS instance.
+
+ static const int STATE_REPLAY = 8; // up, starting prior failed instance. scanning journal.
+ static const int STATE_RESOLVE = 9; // up, disambiguating distributed operations (import, rename, etc.)
+ static const int STATE_RECONNECT = 10; // up, reconnect to clients
+ static const int STATE_REJOIN = 11; // up, replayed journal, rejoining distributed cache
+ static const int STATE_ACTIVE = 12; // up, active
+ static const int STATE_STOPPING = 13; // up, exporting metadata (-> standby or out)
static const char *get_state_name(int s) {
switch (s) {
// down and out
- case STATE_DNE: return "down:dne";
- case STATE_STOPPED: return "down:stopped";
+ case STATE_DNE: return "down:dne";
+ case STATE_DESTROYING: return "down:destroying";
+ case STATE_STOPPED: return "down:stopped";
// down and in
- case STATE_FAILED: return "down:failed";
+ case STATE_FAILED: return "down:failed";
// up and out
- case STATE_BOOT: return "up:boot";
- case STATE_CREATING: return "up:creating";
- case STATE_STARTING: return "up:starting";
- case STATE_STANDBY: return "up:standby";
+ case STATE_BOOT: return "up:boot";
+ case STATE_STANDBY: return "up:standby";
+ case STATE_CREATING: return "up:creating";
+ case STATE_STARTING: return "up:starting";
// up and in
- case STATE_REPLAY: return "up:replay";
- case STATE_RESOLVE: return "up:resolve";
- case STATE_RECONNECT: return "up:reconnect";
- case STATE_REJOIN: return "up:rejoin";
- case STATE_ACTIVE: return "up:active";
- case STATE_STOPPING: return "up:stopping";
+ case STATE_REPLAY: return "up:replay";
+ case STATE_RESOLVE: return "up:resolve";
+ case STATE_RECONNECT: return "up:reconnect";
+ case STATE_REJOIN: return "up:rejoin";
+ case STATE_ACTIVE: return "up:active";
+ case STATE_STOPPING: return "up:stopping";
default: assert(0);
}
return 0;
protected:
epoch_t epoch;
+ epoch_t client_epoch; // incremented only when change is significant to client.
utime_t created;
- epoch_t same_in_set_since; // note: this does not reflect exit-by-failure.
- int target_num;
- int anchortable; // which MDS has anchortable (fixme someday)
- int root; // which MDS has root directory
+ int32_t max_mds;
+ int32_t anchortable; // which MDS has anchortable (fixme someday)
+ int32_t root; // which MDS has root directory
+
+ map<int32_t,int32_t> mds_state; // MDS state
+ map<int32_t,version_t> mds_state_seq;
+ map<int32_t,entity_inst_t> mds_inst; // up instances
+ map<int32_t,int32_t> mds_inc; // incarnation count (monotonically increases)
- set<int> mds_created; // which mds ids have initialized journals and id tables.
- map<int,int> mds_state; // MDS state
- map<int,version_t> mds_state_seq;
- map<int,entity_inst_t> mds_inst; // up instances
- map<int,int> mds_inc; // incarnation count (monotonically increases)
+ map<entity_addr_t,int32_t> standby; // -1 == any
+ map<int32_t, set<entity_addr_t> > standby_for;
+ set<entity_addr_t> standby_any;
friend class MDSMonitor;
public:
- MDSMap() : epoch(0), same_in_set_since(0), anchortable(0), root(0) {}
+ // Default map: epoch 0, no ranks. max_mds is zeroed as well so that is_full()
+ // and encode() never read an indeterminate value before it is configured.
+ MDSMap() : epoch(0), client_epoch(0), max_mds(0), anchortable(0), root(0) {}
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
const utime_t& get_create() const { return created; }
- epoch_t get_same_in_set_since() const { return same_in_set_since; }
int get_anchortable() const { return anchortable; }
int get_root() const { return root; }
bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; }
bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; }
- bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; }
bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; }
bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; }
bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; }
bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); }
bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; }
- bool has_created(int m) { return mds_created.count(m); }
+ bool is_standby(entity_addr_t a) { return standby.count(a); }
// cluster states
bool is_full() { // true once the count from get_num_in_mds() has reached the rank limit
- return get_num_in_mds() >= target_num;
+ return get_num_in_mds() >= max_mds; // max_mds replaces the old target_num field
}
bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set.
return
p->second == STATE_CREATING)
in++;
}
- return (in > target_num);
+ return (in > max_mds);
}
int get_state(int m) {
++p) {
if (p->second.addr == addr) return p->first;
}
- /*else
- for (map<int,entity_inst_t>::iterator p = mds_inst.begin();
- p != mds_inst.end();
- ++p) {
- if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first;
- }
- */
-
+ if (standby.count(addr))
+ return -2;
return -1;
}
// serialize, unserialize
void encode(bufferlist& bl) { // Append every map field to bl; the field order here must match decode().
::_encode(epoch, bl);
- ::_encode(target_num, bl);
+ ::_encode(client_epoch, bl);
::_encode(created, bl);
- ::_encode(same_in_set_since, bl);
::_encode(anchortable, bl);
::_encode(root, bl);
+ ::_encode(max_mds, bl);
::_encode(mds_state, bl); // per-rank tables
::_encode(mds_state_seq, bl);
::_encode(mds_inst, bl);
::_encode(mds_inc, bl);
+ ::_encode(standby, bl); // standby bookkeeping goes last
+ ::_encode(standby_for, bl);
+ ::_encode(standby_any, bl);
}
void decode(bufferlist& bl) { // Read every map field back from bl; the field order here must match encode().
int off = 0; // running read offset into bl, passed to each _decode call
::_decode(epoch, bl, off);
- ::_decode(target_num, bl, off);
+ ::_decode(client_epoch, bl, off);
::_decode(created, bl, off);
- ::_decode(same_in_set_since, bl, off);
::_decode(anchortable, bl, off);
::_decode(root, bl, off);
+ ::_decode(max_mds, bl, off);
::_decode(mds_state, bl, off); // per-rank tables
::_decode(mds_state_seq, bl, off);
::_decode(mds_inst, bl, off);
::_decode(mds_inc, bl, off);
+ ::_decode(standby, bl, off); // standby bookkeeping comes last
+ ::_decode(standby_for, bl, off);
+ ::_decode(standby_any, bl, off);
}
void MDSMonitor::print_map(MDSMap &m) // Dump map m through dout(7): a header, one line per rank, then the shared standbys.
{
- dout(7) << "print_map epoch " << m.get_epoch() << " target_num " << m.target_num << dendl;
+ dout(7) << "print_map epoch " << m.get_epoch() << " max " << m.max_mds << dendl;
entity_inst_t blank; // printed in place of the instance when have_inst() is false
set<int> all;
m.get_mds_set(all);
for (set<int>::iterator p = all.begin();
p != all.end();
++p) {
- dout(7) << " mds" << *p << "." << m.mds_inc[*p]
- << " : " << MDSMap::get_state_name(m.get_state(*p))
- << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank)
- << dendl;
+ if (m.standby_for.count(*p) && !m.standby_for[*p].empty()) { // rank has dedicated standbys: append their count and addresses
+ dout(7) << " mds" << *p << "." << m.mds_inc[*p]
+ << " : " << MDSMap::get_state_name(m.get_state(*p))
+ << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank)
+ << " : +" << m.standby_for[*p].size()
+ << " standby " << m.standby_for[*p]
+ << dendl;
+ } else { // no dedicated standbys: same line without the standby suffix
+ dout(7) << " mds" << *p << "." << m.mds_inc[*p]
+ << " : " << MDSMap::get_state_name(m.get_state(*p))
+ << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank)
+ << dendl;
+ }
+ }
+ if (!m.standby_any.empty()) { // standbys not tied to a particular rank
+ dout(7) << " +" << m.standby_any.size() << " shared standby " << m.standby_any << dendl;
}
}
void MDSMonitor::create_initial() // Fill in the first pending_mdsmap: rank limit from config, creation time = now; then log it.
{
dout(10) << "create_initial" << dendl;
- pending_mdsmap.target_num = g_conf.num_mds;
+ pending_mdsmap.max_mds = g_conf.num_mds; // initial rank limit is the num_mds config value
pending_mdsmap.created = g_clock.now();
print_map(pending_mdsmap);
}
bcast_latest_mds();
send_to_waiting();
+ // make sure last_beacon is populated
+ for (map<int32_t,entity_inst_t>::iterator p = mdsmap.mds_inst.begin();
+ p != mdsmap.mds_inst.end();
+ ++p)
+ if (last_beacon.count(p->second.addr) == 0)
+ last_beacon[p->second.addr] = g_clock.now();
+ for (map<entity_addr_t,int32_t>::iterator p = mdsmap.standby.begin();
+ p != mdsmap.standby.end();
+ ++p )
+ if (last_beacon.count(p->first) == 0)
+ last_beacon[p->first] = g_clock.now();
+
return true;
}
// let's see.
int from = m->get_mds_inst().name.num();
+ entity_addr_t addr = m->get_mds_inst().addr;
int state = m->get_state();
version_t seq = m->get_seq();
// boot?
if (state == MDSMap::STATE_BOOT) {
// already booted?
- int already = mdsmap.get_addr_rank(m->get_mds_inst().addr);
- if (already < 0)
- return false; // need to update map
-
- // already booted. just reply to beacon, as per usual.
- from = already;
- }
+ if (pending_mdsmap.get_addr_rank(addr) == -1)
+ return false; // not booted|booting|standby yet
- // reply to beacon
- if (mdsmap.mds_state_seq[from] > seq) {
- dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
- delete m;
- return true;
+ // ignore.
+ goto out;
}
-
- // reply to beacon?
- if (state != MDSMap::STATE_STOPPED) {
- last_beacon[from] = g_clock.now(); // note time
- mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq),
- m->get_mds_inst());
+ else if (state == MDSMap::STATE_STANDBY) {
+ // standby?
+ if (!pending_mdsmap.is_standby(addr) &&
+ !mdsmap.is_standby(addr)) {
+ dout(7) << "mds_beacon " << *m << " claiming standby, but not, ignoring" << dendl;
+ goto out;
+ }
+ // reply.
}
-
- // is there a state change here?
- if (mdsmap.mds_state.count(from) == 0) {
- if (state == MDSMap::STATE_BOOT)
- return false; // need to add to map
- dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << dendl;
- } else if (mdsmap.mds_state[from] != state) {
- if (mdsmap.get_epoch() == m->get_last_epoch_seen())
- return false; // need to update map
- dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl;
+ else {
+ // old seq?
+ if (mdsmap.mds_state_seq[from] > seq) {
+ dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
+ goto out;
+ }
+
+ // is there a state change here?
+ if (mdsmap.mds_state.count(from) == 0) {
+ dout(1) << "mds_beacon " << *m << " announcing non-boot|standby state, ignoring" << dendl;
+ goto out;
+ }
+
+ if (mdsmap.mds_state[from] != state) {
+ if (mdsmap.get_epoch() == m->get_last_epoch_seen())
+ return false; // need to update map
+ dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl;
+ }
}
-
- // we're done.
+
+ // note time and reply
+ dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl;
+ last_beacon[addr] = g_clock.now();
+ mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq, 0),
+ m->get_mds_inst());
+
+ // done
+ out:
delete m;
return true;
}
<< " from " << m->get_mds_inst()
<< dendl;
int from = m->get_mds_inst().name.num();
+ entity_addr_t addr = m->get_mds_inst().addr;
int state = m->get_state();
version_t seq = m->get_seq();
- assert(state != mdsmap.get_state(from));
-
// boot?
+ int standby_for = -1;
if (state == MDSMap::STATE_BOOT) {
- // assign a name.
- if (from >= 0) {
- // wants to be (or already is) a specific MDS.
- if (!g_conf.mon_allow_mds_bully &&
- (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) {
- dout(10) << "mds_beacon boot: mds" << from << " is someone else" << dendl;
- from = -1;
- } else {
- switch (mdsmap.get_state(from)) {
- case MDSMap::STATE_STOPPED:
- case MDSMap::STATE_STARTING:
- case MDSMap::STATE_STANDBY:
- state = MDSMap::STATE_STARTING;
- break;
- case MDSMap::STATE_DNE:
- case MDSMap::STATE_CREATING:
- state = MDSMap::STATE_CREATING;
- break;
- case MDSMap::STATE_FAILED:
- default:
- state = MDSMap::STATE_REPLAY;
- break;
- }
- dout(10) << "mds_beacon boot: mds" << from
- << " was " << MDSMap::get_state_name(mdsmap.get_state(from))
- << ", " << MDSMap::get_state_name(state)
- << dendl;
- }
+ from = -1;
+
+ // standby for a given rank?
+ standby_for = m->get_want_rank();
+ if (standby_for >= pending_mdsmap.max_mds) {
+ dout(10) << "mds_beacon boot: wanted standby for mds" << from
+ << " >= max_mds " << pending_mdsmap.max_mds
+ << ", will be shared standby" << dendl;
+ standby_for = -1;
}
- if (from < 0) {
- from = pending_mdsmap.get_addr_rank(m->get_mds_inst().addr);
- if (from >= 0) {
- state = pending_mdsmap.mds_state[from];
- dout(10) << "mds_beacon boot: already pending mds" << from
- << " " << MDSMap::get_state_name(state) << dendl;
- delete m;
- return false;
+ if (standby_for >= 0 && pending_mdsmap.is_down(standby_for)) {
+ // wants to be a specific MDS, who is down
+ from = standby_for;
+ switch (pending_mdsmap.get_state(standby_for)) {
+ case MDSMap::STATE_STOPPED:
+ state = MDSMap::STATE_STARTING;
+ break;
+ case MDSMap::STATE_DNE:
+ state = MDSMap::STATE_CREATING;
+ break;
+ case MDSMap::STATE_FAILED:
+ state = MDSMap::STATE_REPLAY;
+ break;
+ default:
+ assert(0);
}
+ dout(10) << "mds_beacon boot: mds" << from
+ << " was " << MDSMap::get_state_name(pending_mdsmap.get_state(from))
+ << ", " << MDSMap::get_state_name(state)
+ << dendl;
}
- if (from < 0) {
- // pick a failed mds?
+ else if (standby_for < 0) {
+ // pick another failed mds?
set<int> failed;
pending_mdsmap.get_failed_mds_set(failed);
if (!failed.empty()) {
state = MDSMap::STATE_REPLAY;
}
}
- if (from < 0) {
- // ok, just pick any unused mds id.
- for (from=0; ; ++from) {
- if (pending_mdsmap.is_dne(from)) {
+ if (from < 0 && standby_for < 0 &&
+ !pending_mdsmap.is_degraded()) {
+ // ok, just pick any unused mds rank
+ // that doesn't make us overfull
+ for (int i=0; i<pending_mdsmap.max_mds; i++) {
+ if (pending_mdsmap.would_be_overfull_with(i)) continue;
+ if (pending_mdsmap.is_dne(i)) {
+ from = i;
dout(10) << "mds_beacon boot: assigned new mds" << from << dendl;
state = MDSMap::STATE_CREATING;
break;
- } else if (pending_mdsmap.is_stopped(from)) {
+ } else if (pending_mdsmap.is_stopped(i)) {
+ from = i;
dout(10) << "mds_beacon boot: assigned stopped mds" << from << dendl;
state = MDSMap::STATE_STARTING;
break;
}
}
}
-
- assert(state == MDSMap::STATE_CREATING ||
- state == MDSMap::STATE_STARTING ||
- state == MDSMap::STATE_REPLAY);
-
- // put it in the map.
- pending_mdsmap.mds_inst[from].addr = m->get_mds_inst().addr;
- pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from);
- pending_mdsmap.mds_inc[from]++;
-
- // reset the beacon timer
- last_beacon[from] = g_clock.now();
-
- // if starting|creating and degraded|full, go to standby
- if ((state == MDSMap::STATE_CREATING || state == MDSMap::STATE_STARTING) &&
- (pending_mdsmap.would_be_overfull_with(from) ||
- pending_mdsmap.is_degraded())) {
- dout(10) << "mds_beacon cluster full, mds" << from << " will be standby" << dendl;
+
+ if (from < 0) {
+ // standby
+ if (standby_for < 0) {
+ dout(10) << "mds_beacon boot: standby for any" << dendl;
+ pending_mdsmap.standby_any.insert(addr);
+ } else {
+ dout(10) << "mds_beacon boot: standby for mds" << standby_for << dendl;
+ pending_mdsmap.standby_for[standby_for].insert(addr);
+ }
+ pending_mdsmap.standby[addr] = standby_for;
state = MDSMap::STATE_STANDBY;
+ } else {
+ // join|takeover
+ assert(state == MDSMap::STATE_CREATING ||
+ state == MDSMap::STATE_STARTING ||
+ state == MDSMap::STATE_REPLAY);
+
+ pending_mdsmap.mds_inst[from].addr = addr;
+ pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from);
+ pending_mdsmap.mds_inc[from]++;
+ pending_mdsmap.mds_state[from] = state;
+ pending_mdsmap.mds_state_seq[from] = seq;
}
- }
- // created?
- if (state == MDSMap::STATE_ACTIVE &&
- mdsmap.is_creating(from)) {
- pending_mdsmap.mds_created.insert(from);
- dout(10) << "mds_beacon created mds" << from << dendl;
- }
-
- // update the map
- dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from])
- << " -> " << MDSMap::get_state_name(state)
- << dendl;
+ // initialize the beacon timer
+ last_beacon[addr] = g_clock.now();
- // has someone join or leave the cluster?
- if (state == MDSMap::STATE_REPLAY ||
- state == MDSMap::STATE_ACTIVE ||
- state == MDSMap::STATE_STOPPED) {
- pending_mdsmap.same_in_set_since = pending_mdsmap.epoch;
+ } else {
+ // state change
+ dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from])
+ << " -> " << MDSMap::get_state_name(state)
+ << dendl;
+
+ // change the state
+ pending_mdsmap.mds_state[from] = state;
+ if (pending_mdsmap.is_up(from))
+ pending_mdsmap.mds_state_seq[from] = seq;
+ else
+ pending_mdsmap.mds_state_seq.erase(from);
}
-
- // change the state
- pending_mdsmap.mds_state[from] = state;
- if (pending_mdsmap.is_up(from))
- pending_mdsmap.mds_state_seq[from] = seq;
- else
- pending_mdsmap.mds_state_seq.erase(from);
-
+
dout(7) << "pending map now:" << dendl;
print_map(pending_mdsmap);
-
+
paxos->wait_for_commit(new C_Updated(this, from, m));
return true;
void MDSMonitor::_updated(int from, MMDSBeacon *m)
{
- if (m->get_state() == MDSMap::STATE_BOOT) {
+ if (from < 0) {
dout(10) << "_updated (booted) mds" << from << " " << *m << dendl;
- mon->osdmon->send_latest(mdsmap.get_inst(from));
+ mon->osdmon->send_latest(m->get_source_inst());
} else {
dout(10) << "_updated mds" << from << " " << *m << dendl;
}
+// After a map commit: for every failed rank, hand the rank to a standby
+// (a dedicated one first, otherwise a shared one) and propose the result.
void MDSMonitor::committed()
{
// check for failed
- set<int> standby;
set<int> failed;
- mdsmap.get_mds_set(standby, MDSMap::STATE_STANDBY);
mdsmap.get_failed_mds_set(failed);
- if (!standby.empty() && !failed.empty()) {
- while (!standby.empty() && !failed.empty()) {
- int f = *failed.begin();
- int t = *standby.begin();
- failed.erase(failed.begin());
- standby.erase(standby.begin());
-
- dout(0) << "mds" << t << " taking over for mds" << f << dendl;
-
- // send new map to old inst/name
- waiting_for_map.push_back(mdsmap.mds_inst[t]);
+ if (!mdsmap.standby.empty() && !failed.empty()) {
+ bool didtakeover = false;
+ set<int>::iterator p = failed.begin();
+ while (p != failed.end()) {
+ int f = *p++;
- pending_mdsmap.mds_inst[f] = mdsmap.mds_inst[t];
- pending_mdsmap.mds_inst[f].name = entity_name_t::MDS(f);
- pending_mdsmap.mds_inc[f]++;
- pending_mdsmap.mds_state[f] = MDSMap::STATE_REPLAY;
- pending_mdsmap.mds_state_seq[f] = mdsmap.mds_state_seq[t];
-
- pending_mdsmap.mds_inst.erase(t);
- pending_mdsmap.mds_state.erase(t);
- pending_mdsmap.mds_state_seq.erase(t);
-
- last_beacon[f] = last_beacon[t];
- last_beacon.erase(t);
+ // Choose from pending_mdsmap, not mdsmap: take_over() removes the chosen
+ // standby from the pending lists, so a later failed rank in this same pass
+ // cannot be handed an address that was already used.
+
+ // someone standby for me?
+ if (pending_mdsmap.standby_for.count(f) &&
+ !pending_mdsmap.standby_for[f].empty()) {
+ entity_addr_t a = *pending_mdsmap.standby_for[f].begin();
+ dout(0) << "mds" << f << " standby " << a << " taking over" << dendl;
+ take_over(a, f);
+ didtakeover = true;
+ }
+ else if (!pending_mdsmap.standby_any.empty()) {
+ // take from the shared set itself; standby.begin() could be a standby
+ // that is dedicated to some other rank
+ entity_addr_t a = *pending_mdsmap.standby_any.begin();
+ dout(0) << "standby " << a << " taking over for mds" << f << dendl;
+ take_over(a, f);
+ didtakeover = true;
+ }
+ }
+ if (didtakeover) {
+ dout(7) << "pending map now:" << dendl;
+ print_map(pending_mdsmap);
+ propose_pending();
}
-
- dout(7) << "pending map now:" << dendl;
- print_map(pending_mdsmap);
-
- propose_pending();
}
// hackish: did all mds's shut down?
mon->monmap->get_inst(mon->whoami));
}
+// Install the standby at 'addr' as rank 'mds' in pending_mdsmap, starting in
+// STATE_REPLAY with a bumped incarnation, drop it from every standby list,
+// and queue it to receive the new map.
+void MDSMonitor::take_over(entity_addr_t addr, int mds)
+{
+ pending_mdsmap.mds_inst[mds].addr = addr;
+ pending_mdsmap.mds_inst[mds].name = entity_name_t::MDS(mds);
+ pending_mdsmap.mds_inc[mds]++;
+ pending_mdsmap.mds_state[mds] = MDSMap::STATE_REPLAY;
+ pending_mdsmap.mds_state_seq[mds] = 0;
+
+ // remove from standby list(s)
+ map<entity_addr_t,int32_t>::iterator s = pending_mdsmap.standby.find(addr);
+ if (s != pending_mdsmap.standby.end()) {
+ // the standby may have registered for a rank other than 'mds'; clear that entry too
+ if (s->second >= 0 && pending_mdsmap.standby_for.count(s->second))
+ pending_mdsmap.standby_for[s->second].erase(addr);
+ pending_mdsmap.standby.erase(s);
+ }
+ // count() first: operator[] would insert an empty set for a rank with no standbys
+ if (pending_mdsmap.standby_for.count(mds))
+ pending_mdsmap.standby_for[mds].erase(addr);
+ pending_mdsmap.standby_any.erase(addr);
+
+ // send new map to old inst/name
+ entity_inst_t oldinst;
+ oldinst.name = entity_name_t::MDS(-2);
+ oldinst.addr = addr;
+ waiting_for_map.push_back(oldinst);
+}
+
+
bool MDSMonitor::handle_command(MMonCommand *m)
{
ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")";
}
}
- else if (m->cmd[1] == "set_target_num" && m->cmd.size() > 2) {
- pending_mdsmap.target_num = atoi(m->cmd[2].c_str());
+ else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) {
+ pending_mdsmap.max_mds = atoi(m->cmd[2].c_str());
r = 0;
- ss << "target_num = " << pending_mdsmap.target_num;
+ ss << "max_mds = " << pending_mdsmap.max_mds;
}
}
if (r == -EINVAL) {
p != up.end();
p++)
send_full(mdsmap.get_inst(*p));
+
+ // standby too
+ entity_inst_t inst;
+ inst.name = entity_name_t::MDS(-1);
+ for (map<entity_addr_t,int32_t>::iterator p = mdsmap.standby.begin();
+ p != mdsmap.standby.end();
+ p++) {
+ inst.addr = p->first;
+ send_full(inst);
+ }
}
void MDSMonitor::send_full(entity_inst_t dest)
+// Periodic liveness check (active leader only): every address whose last
+// beacon is older than mds_beacon_grace is either marked down in
+// pending_mdsmap (if it holds a rank) or dropped from the standby lists.
+// The pending map is proposed once, and only if it actually changed.
void MDSMonitor::tick()
{
// make sure mds's are still alive
- utime_t now = g_clock.now();
-
// ...if i am an active leader
if (!mon->is_leader()) return;
if (!paxos->is_active()) return;
- if (now > g_conf.mds_beacon_grace) {
- utime_t cutoff = now;
- cutoff -= g_conf.mds_beacon_grace;
+ utime_t cutoff = g_clock.now();
+ cutoff -= g_conf.mds_beacon_grace;
+
+ // set whenever pending_mdsmap is modified below
+ bool changed = false;
- bool changed = false;
-
- set<int> up;
- mdsmap.get_up_mds_set(up);
-
- for (set<int>::iterator p = up.begin();
- p != up.end();
- ++p) {
- if (last_beacon.count(*p)) {
- if (last_beacon[*p] < cutoff) {
-
- // failure!
- int newstate;
- switch (mdsmap.get_state(*p)) {
- case MDSMap::STATE_STANDBY:
- if (mdsmap.has_created(*p))
- newstate = MDSMap::STATE_STOPPED;
- else
- newstate = MDSMap::STATE_DNE;
- break;
-
- case MDSMap::STATE_CREATING:
- // didn't finish creating
- newstate = MDSMap::STATE_DNE;
- break;
-
- case MDSMap::STATE_STARTING:
- newstate = MDSMap::STATE_STOPPED;
- break;
-
- case MDSMap::STATE_REPLAY:
- case MDSMap::STATE_RESOLVE:
- case MDSMap::STATE_RECONNECT:
- case MDSMap::STATE_REJOIN:
- case MDSMap::STATE_ACTIVE:
- case MDSMap::STATE_STOPPING:
- newstate = MDSMap::STATE_FAILED;
- break;
-
- default:
- assert(0);
- }
-
- dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p]
- << ", marking " << mdsmap.get_state_name(newstate)
- << dendl;
-
- // update map
- pending_mdsmap.mds_state[*p] = newstate;
- pending_mdsmap.mds_state_seq.erase(*p);
- changed = true;
- }
- } else {
- dout(10) << "no beacons from mds" << *p << ", assuming one " << now << dendl;
- last_beacon[*p] = now;
+ map<entity_addr_t, utime_t>::iterator p = last_beacon.begin();
+ while (p != last_beacon.end()) {
+ // copy the key and advance first: the entry may be erased below
+ entity_addr_t addr = p->first;
+ p++;
+
+ if (last_beacon[addr] >= cutoff) continue;
+
+ int mds = pending_mdsmap.get_addr_rank(addr);
+ if (mds >= 0) {
+ // failure!
+ int newstate;
+ switch (pending_mdsmap.get_state(mds)) {
+ case MDSMap::STATE_CREATING:
+ newstate = MDSMap::STATE_DNE; // didn't finish creating
+ break;
+
+ case MDSMap::STATE_STARTING:
+ newstate = MDSMap::STATE_STOPPED;
+ break;
+
+ case MDSMap::STATE_REPLAY:
+ case MDSMap::STATE_RESOLVE:
+ case MDSMap::STATE_RECONNECT:
+ case MDSMap::STATE_REJOIN:
+ case MDSMap::STATE_ACTIVE:
+ case MDSMap::STATE_STOPPING:
+ newstate = MDSMap::STATE_FAILED;
+ break;
+
+ default:
+ assert(0);
}
+
+ // log the rank, not *p: p was already advanced and may be end()
+ dout(10) << "no beacon from mds" << mds << " since " << last_beacon[addr]
+ << ", marking " << pending_mdsmap.get_state_name(newstate)
+ << dendl;
+
+ // update map
+ pending_mdsmap.mds_state[mds] = newstate;
+ pending_mdsmap.mds_state_seq.erase(mds);
+ changed = true;
+ }
+ else if (pending_mdsmap.is_standby(addr)) {
+ dout(10) << "no beacon from standby " << addr << " since " << last_beacon[addr]
+ << ", removing from standby list"
+ << dendl;
+ if (pending_mdsmap.standby[addr] >= 0)
+ pending_mdsmap.standby_for[pending_mdsmap.standby[addr]].erase(addr);
+ else
+ pending_mdsmap.standby_any.erase(addr);
+ pending_mdsmap.standby.erase(addr);
+ changed = true;
+ }
+ else {
+ // not in the map at all: nothing to propose, just forget the entry
+ dout(0) << "BUG: removing stray " << addr << " from last_beacon map" << dendl;
}
- if (changed)
- propose_pending();
+ last_beacon.erase(addr);
}
+
+ // one proposal for the whole pass, and none when only stray entries were dropped
+ if (changed)
+ propose_pending();
}
pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING;
break;
case MDSMap::STATE_CREATING:
- case MDSMap::STATE_STANDBY:
pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE;
break;
case MDSMap::STATE_STARTING:
break;
}
}
+ // hose standby list
+ pending_mdsmap.standby.clear();
+ pending_mdsmap.standby_for.clear();
+ pending_mdsmap.standby_any.clear();
propose_pending();
}