From 34d1396c1fe8cd81cabae1e7351250d297f17277 Mon Sep 17 00:00:00 2001 From: sageweil Date: Sun, 21 Oct 2007 21:43:48 +0000 Subject: [PATCH] reworked mdsmon, beacon handling, mdsmap standby queues git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1975 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/mds/MDS.cc | 26 +- trunk/ceph/mds/MDS.h | 1 + trunk/ceph/mds/MDSMap.h | 135 +++++---- trunk/ceph/messages/MMDSBeacon.h | 8 +- trunk/ceph/mon/MDSMonitor.cc | 491 +++++++++++++++++-------------- trunk/ceph/mon/MDSMonitor.h | 4 +- 6 files changed, 375 insertions(+), 290 deletions(-) diff --git a/trunk/ceph/mds/MDS.cc b/trunk/ceph/mds/MDS.cc index 6fc8ef46d9039..e82e81c26496b 100644 --- a/trunk/ceph/mds/MDS.cc +++ b/trunk/ceph/mds/MDS.cc @@ -329,13 +329,15 @@ int MDS::init(bool standby) { mds_lock.Lock(); - objecter->init(); - - want_state = MDSMap::STATE_BOOT; - // starting beacon. this will induce an MDSMap from the monitor + want_state = MDSMap::STATE_BOOT; + want_rank = whoami; beacon_start(); - + whoami = -1; + messenger->reset_myname(entity_name_t::MDS(whoami)); + + objecter->init(); + // schedule tick reset_tick(); @@ -413,7 +415,7 @@ void MDS::beacon_send() int mon = monmap->pick_mon(); messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(), - want_state, beacon_last_seq), + want_state, beacon_last_seq, want_rank), monmap->get_inst(mon)); // schedule next sender @@ -510,15 +512,22 @@ void MDS::handle_mds_map(MMDSMap *m) // see who i am whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); if (whoami < 0) { + if (mdsmap->is_standby(messenger->get_myaddr())) { + if (state != MDSMap::STATE_STANDBY) { + want_state = state = MDSMap::STATE_STANDBY; + dout(1) << "handle_mds_map standby" << dendl; + } + goto out; + } dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << dendl; suicide(); - return; + goto out; } // open logger? // note that fakesyn/newsyn starts knowing who they are if (whoami >= 0 && - mdsmap->is_up(whoami) && !mdsmap->is_standby(whoami) && + mdsmap->is_up(whoami) && (oldwhoami != whoami || !logger)) reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline @@ -666,6 +675,7 @@ void MDS::handle_mds_map(MMDSMap *m) beacon_send(); } + out: delete m; delete oldmap; } diff --git a/trunk/ceph/mds/MDS.h b/trunk/ceph/mds/MDS.h index 4dcd73662dbe8..ff032f7f8de1a 100644 --- a/trunk/ceph/mds/MDS.h +++ b/trunk/ceph/mds/MDS.h @@ -112,6 +112,7 @@ class MDS : public Dispatcher { // -- MDS state -- int state; // my confirmed state int want_state; // the state i want + int want_rank; // the mds rank i want list waiting_for_active; map > waiting_for_active_peer; diff --git a/trunk/ceph/mds/MDSMap.h b/trunk/ceph/mds/MDSMap.h index f2b31ca0fd1c1..9a4371609c7eb 100644 --- a/trunk/ceph/mds/MDSMap.h +++ b/trunk/ceph/mds/MDSMap.h @@ -44,9 +44,28 @@ using namespace std; \ | \ v \-- STOPPING - + new states: + + boot --> standby, creating, or starting. + + + dne ----> creating -----> active* + ^ ^___________/ / ^ ^ + | / / | + destroying / / | + ^ / / | + | / / | + stopped <---- stopping* <-/ / | + \ / | + ----- starting* ----/ | + | + failed | + \ | + \--> replay* --> reconnect* --> rejoin* + + * = can fail */ @@ -54,41 +73,44 @@ using namespace std; class MDSMap { public: // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_STOPPED = -1; // down, once existed, but no subtrees. empty log. - static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered. - - static const int STATE_BOOT = -3; // up, boot announcement. destiny unknown. - static const int STATE_STANDBY = -4; // up, idle. waiting for assignment by monitor. - static const int STATE_CREATING = -5; // up, creating MDS instance (new journal, idalloc..). - static const int STATE_STARTING = -6; // up, starting prior stopped MDS instance. - - static const int STATE_REPLAY = 7; // up, starting prior failed instance. scanning journal. - static const int STATE_RESOLVE = 8; // up, disambiguating distributed operations (import, rename, etc.) - static const int STATE_RECONNECT = 9; // up, reconnect to clients - static const int STATE_REJOIN = 10; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 11; // up, active - static const int STATE_STOPPING = 12; // up, exporting metadata (-> standby or out) + static const int STATE_DNE = 0; // down, never existed. + static const int STATE_DESTROYING = -1; // down, existing, semi-destroyed. + static const int STATE_STOPPED = -2; // down, once existed, but no subtrees. empty log. + static const int STATE_FAILED = 3; // down, active subtrees; needs to be recovered. + + static const int STATE_BOOT = -4; // up, boot announcement. destiny unknown. + static const int STATE_STANDBY = -5; // up, idle. waiting for assignment by monitor. + + static const int STATE_CREATING = -6; // up, creating MDS instance (new journal, idalloc..). + static const int STATE_STARTING = -7; // up, starting prior stopped MDS instance. + + static const int STATE_REPLAY = 8; // up, starting prior failed instance. scanning journal. + static const int STATE_RESOLVE = 9; // up, disambiguating distributed operations (import, rename, etc.) + static const int STATE_RECONNECT = 10; // up, reconnect to clients + static const int STATE_REJOIN = 11; // up, replayed journal, rejoining distributed cache + static const int STATE_ACTIVE = 12; // up, active + static const int STATE_STOPPING = 13; // up, exporting metadata (-> standby or out) static const char *get_state_name(int s) { switch (s) { // down and out - case STATE_DNE: return "down:dne"; - case STATE_STOPPED: return "down:stopped"; + case STATE_DNE: return "down:dne"; + case STATE_DESTROYING: return "down:destroying"; + case STATE_STOPPED: return "down:stopped"; // down and in - case STATE_FAILED: return "down:failed"; + case STATE_FAILED: return "down:failed"; // up and out - case STATE_BOOT: return "up:boot"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_STANDBY: return "up:standby"; + case STATE_BOOT: return "up:boot"; + case STATE_STANDBY: return "up:standby"; + case STATE_CREATING: return "up:creating"; + case STATE_STARTING: return "up:starting"; // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; + case STATE_REPLAY: return "up:replay"; + case STATE_RESOLVE: return "up:resolve"; + case STATE_RECONNECT: return "up:reconnect"; + case STATE_REJOIN: return "up:rejoin"; + case STATE_ACTIVE: return "up:active"; + case STATE_STOPPING: return "up:stopping"; default: assert(0); } return 0; @@ -96,29 +118,31 @@ class MDSMap { protected: epoch_t epoch; + epoch_t client_epoch; // incremented only when change is significant to client. utime_t created; - epoch_t same_in_set_since; // note: this does not reflect exit-by-failure. - int target_num; - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory + int32_t max_mds; + int32_t anchortable; // which MDS has anchortable (fixme someday) + int32_t root; // which MDS has root directory + + map mds_state; // MDS state + map mds_state_seq; + map mds_inst; // up instances + map mds_inc; // incarnation count (monotonically increases) - set mds_created; // which mds ids have initialized journals and id tables. - map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) + map standby; // -1 == any + map > standby_for; + set standby_any; friend class MDSMonitor; public: - MDSMap() : epoch(0), same_in_set_since(0), anchortable(0), root(0) {} + MDSMap() : epoch(0), client_epoch(0), anchortable(0), root(0) {} epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } const utime_t& get_create() const { return created; } - epoch_t get_same_in_set_since() const { return same_in_set_since; } int get_anchortable() const { return anchortable; } int get_root() const { return root; } @@ -209,7 +233,6 @@ class MDSMap { bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; } - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } @@ -221,11 +244,11 @@ class MDSMap { bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - bool has_created(int m) { return mds_created.count(m); } + bool is_standby(entity_addr_t a) { return standby.count(a); } // cluster states bool is_full() { - return get_num_in_mds() >= target_num; + return get_num_in_mds() >= max_mds; } bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. return @@ -263,7 +286,7 @@ class MDSMap { p->second == STATE_CREATING) in++; } - return (in > target_num); + return (in > max_mds); } int get_state(int m) { @@ -295,14 +318,8 @@ class MDSMap { ++p) { if (p->second.addr == addr) return p->first; } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - + if (standby.count(addr)) + return -2; return -1; } @@ -323,29 +340,35 @@ class MDSMap { // serialize, unserialize void encode(bufferlist& bl) { ::_encode(epoch, bl); - ::_encode(target_num, bl); + ::_encode(client_epoch, bl); ::_encode(created, bl); - ::_encode(same_in_set_since, bl); ::_encode(anchortable, bl); ::_encode(root, bl); + ::_encode(max_mds, bl); ::_encode(mds_state, bl); ::_encode(mds_state_seq, bl); ::_encode(mds_inst, bl); ::_encode(mds_inc, bl); + ::_encode(standby, bl); + ::_encode(standby_for, bl); + ::_encode(standby_any, bl); } void decode(bufferlist& bl) { int off = 0; ::_decode(epoch, bl, off); - ::_decode(target_num, bl, off); + ::_decode(client_epoch, bl, off); ::_decode(created, bl, off); - ::_decode(same_in_set_since, bl, off); ::_decode(anchortable, bl, off); ::_decode(root, bl, off); + ::_decode(max_mds, bl, off); ::_decode(mds_state, bl, off); ::_decode(mds_state_seq, bl, off); ::_decode(mds_inst, bl, off); ::_decode(mds_inc, bl, off); + ::_decode(standby, bl, off); + ::_decode(standby_for, bl, off); + ::_decode(standby_any, bl, off); } diff --git a/trunk/ceph/messages/MMDSBeacon.h b/trunk/ceph/messages/MMDSBeacon.h index c18a05e77f1a8..3a2a90f49152f 100644 --- a/trunk/ceph/messages/MMDSBeacon.h +++ b/trunk/ceph/messages/MMDSBeacon.h @@ -26,18 +26,20 @@ class MMDSBeacon : public Message { epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree int state; version_t seq; + int want_rank; public: MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se) : + MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se, int wr) : Message(MSG_MDS_BEACON), - inst(i), last_epoch_seen(les), state(st), seq(se) { } + inst(i), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { } entity_inst_t& get_mds_inst() { return inst; } epoch_t get_last_epoch_seen() { return last_epoch_seen; } int get_state() { return state; } version_t get_seq() { return seq; } char *get_type_name() { return "mdsbeacon"; } + int get_want_rank() { return want_rank; } void print(ostream& out) { out << "mdsbeacon(" << inst @@ -50,6 +52,7 @@ class MMDSBeacon : public Message { ::_encode(last_epoch_seen, payload); ::_encode(state, payload); ::_encode(seq, payload); + ::_encode(want_rank, payload); } void decode_payload() { int off = 0; @@ -57,6 +60,7 @@ class MMDSBeacon : public Message { ::_decode(last_epoch_seen, payload, off); ::_decode(state, payload, off); ::_decode(seq, payload, off); + ::_decode(want_rank, payload, off); } }; diff --git a/trunk/ceph/mon/MDSMonitor.cc b/trunk/ceph/mon/MDSMonitor.cc index 645f029f6b203..42edd63f13cc6 100644 --- a/trunk/ceph/mon/MDSMonitor.cc +++ b/trunk/ceph/mon/MDSMonitor.cc @@ -43,17 +43,29 @@ void MDSMonitor::print_map(MDSMap &m) { - dout(7) << "print_map epoch " << m.get_epoch() << " target_num " << m.target_num << dendl; + dout(7) << "print_map epoch " << m.get_epoch() << " max " << m.max_mds << dendl; entity_inst_t blank; set all; m.get_mds_set(all); for (set::iterator p = all.begin(); p != all.end(); ++p) { - dout(7) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) - << dendl; + if (m.standby_for.count(*p) && !m.standby_for[*p].empty()) { + dout(7) << " mds" << *p << "." << m.mds_inc[*p] + << " : " << MDSMap::get_state_name(m.get_state(*p)) + << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) + << " : +" << m.standby_for[*p].size() + << " standby " << m.standby_for[*p] + << dendl; + } else { + dout(7) << " mds" << *p << "." << m.mds_inc[*p] + << " : " << MDSMap::get_state_name(m.get_state(*p)) + << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) + << dendl; + } + } + if (!m.standby_any.empty()) { + dout(7) << " +" << m.standby_any.size() << " shared standby " << m.standby_any << dendl; } } @@ -64,7 +76,7 @@ void MDSMonitor::print_map(MDSMap &m) void MDSMonitor::create_initial() { dout(10) << "create_initial" << dendl; - pending_mdsmap.target_num = g_conf.num_mds; + pending_mdsmap.max_mds = g_conf.num_mds; pending_mdsmap.created = g_clock.now(); print_map(pending_mdsmap); } @@ -96,6 +108,18 @@ bool MDSMonitor::update_from_paxos() bcast_latest_mds(); send_to_waiting(); + // make sure last_beacon is populated + for (map::iterator p = mdsmap.mds_inst.begin(); + p != mdsmap.mds_inst.end(); + ++p) + if (last_beacon.count(p->second.addr) == 0) + last_beacon[p->second.addr] = g_clock.now(); + for (map::iterator p = mdsmap.standby.begin(); + p != mdsmap.standby.end(); + ++p ) + if (last_beacon.count(p->first) == 0) + last_beacon[p->first] = g_clock.now(); + return true; } @@ -157,6 +181,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // let's see. int from = m->get_mds_inst().name.num(); + entity_addr_t addr = m->get_mds_inst().addr; int state = m->get_state(); version_t seq = m->get_seq(); @@ -165,40 +190,49 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // boot? if (state == MDSMap::STATE_BOOT) { // already booted? - int already = mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (already < 0) - return false; // need to update map - - // already booted. just reply to beacon, as per usual. - from = already; - } + if (pending_mdsmap.get_addr_rank(addr) == -1) + return false; // not booted|booting|standby yet - // reply to beacon - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; - delete m; - return true; + // ignore. + goto out; } - - // reply to beacon? - if (state != MDSMap::STATE_STOPPED) { - last_beacon[from] = g_clock.now(); // note time - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq), - m->get_mds_inst()); + else if (state == MDSMap::STATE_STANDBY) { + // standby? + if (!pending_mdsmap.is_standby(addr) && + !mdsmap.is_standby(addr)) { + dout(7) << "mds_beacon " << *m << " claiming standby, but not, ignoring" << dendl; + goto out; + } + // reply. } - - // is there a state change here? - if (mdsmap.mds_state.count(from) == 0) { - if (state == MDSMap::STATE_BOOT) - return false; // need to add to map - dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << dendl; - } else if (mdsmap.mds_state[from] != state) { - if (mdsmap.get_epoch() == m->get_last_epoch_seen()) - return false; // need to update map - dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl; + else { + // old seq? + if (mdsmap.mds_state_seq[from] > seq) { + dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; + goto out; + } + + // is there a state change here? + if (mdsmap.mds_state.count(from) == 0) { + dout(1) << "mds_beacon " << *m << " announcing non-boot|standby state, ignoring" << dendl; + goto out; + } + + if (mdsmap.mds_state[from] != state) { + if (mdsmap.get_epoch() == m->get_last_epoch_seen()) + return false; // need to update map + dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl; + } } - - // we're done. + + // note time and reply + dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl; + last_beacon[addr] = g_clock.now(); + mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq, 0), + m->get_mds_inst()); + + // done + out: delete m; return true; } @@ -233,54 +267,46 @@ bool MDSMonitor::handle_beacon(MMDSBeacon *m) << " from " << m->get_mds_inst() << dendl; int from = m->get_mds_inst().name.num(); + entity_addr_t addr = m->get_mds_inst().addr; int state = m->get_state(); version_t seq = m->get_seq(); - assert(state != mdsmap.get_state(from)); - // boot? + int standby_for = -1; if (state == MDSMap::STATE_BOOT) { - // assign a name. - if (from >= 0) { - // wants to be (or already is) a specific MDS. - if (!g_conf.mon_allow_mds_bully && - (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) { - dout(10) << "mds_beacon boot: mds" << from << " is someone else" << dendl; - from = -1; - } else { - switch (mdsmap.get_state(from)) { - case MDSMap::STATE_STOPPED: - case MDSMap::STATE_STARTING: - case MDSMap::STATE_STANDBY: - state = MDSMap::STATE_STARTING; - break; - case MDSMap::STATE_DNE: - case MDSMap::STATE_CREATING: - state = MDSMap::STATE_CREATING; - break; - case MDSMap::STATE_FAILED: - default: - state = MDSMap::STATE_REPLAY; - break; - } - dout(10) << "mds_beacon boot: mds" << from - << " was " << MDSMap::get_state_name(mdsmap.get_state(from)) - << ", " << MDSMap::get_state_name(state) - << dendl; - } + from = -1; + + // standby for a given rank? + standby_for = m->get_want_rank(); + if (standby_for >= pending_mdsmap.max_mds) { + dout(10) << "mds_beacon boot: wanted standby for mds" << from + << " >= max_mds " << pending_mdsmap.max_mds + << ", will be shared standby" << dendl; + standby_for = -1; } - if (from < 0) { - from = pending_mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (from >= 0) { - state = pending_mdsmap.mds_state[from]; - dout(10) << "mds_beacon boot: already pending mds" << from - << " " << MDSMap::get_state_name(state) << dendl; - delete m; - return false; + if (standby_for >= 0 && pending_mdsmap.is_down(standby_for)) { + // wants to be a specific MDS, who is down + from = standby_for; + switch (pending_mdsmap.get_state(standby_for)) { + case MDSMap::STATE_STOPPED: + state = MDSMap::STATE_STARTING; + break; + case MDSMap::STATE_DNE: + state = MDSMap::STATE_CREATING; + break; + case MDSMap::STATE_FAILED: + state = MDSMap::STATE_REPLAY; + break; + default: + assert(0); } + dout(10) << "mds_beacon boot: mds" << from + << " was " << MDSMap::get_state_name(pending_mdsmap.get_state(from)) + << ", " << MDSMap::get_state_name(state) + << dendl; } - if (from < 0) { - // pick a failed mds? + else if (standby_for < 0) { + // pick another failed mds? set failed; pending_mdsmap.get_failed_mds_set(failed); if (!failed.empty()) { @@ -289,71 +315,70 @@ bool MDSMonitor::handle_beacon(MMDSBeacon *m) state = MDSMap::STATE_REPLAY; } } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (pending_mdsmap.is_dne(from)) { + if (from < 0 && standby_for < 0 && + !pending_mdsmap.is_degraded()) { + // ok, just pick any unused mds rank + // that doesn't make us overfull + for (int i=0; iget_mds_inst().addr; - pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from); - pending_mdsmap.mds_inc[from]++; - - // reset the beacon timer - last_beacon[from] = g_clock.now(); - - // if starting|creating and degraded|full, go to standby - if ((state == MDSMap::STATE_CREATING || state == MDSMap::STATE_STARTING) && - (pending_mdsmap.would_be_overfull_with(from) || - pending_mdsmap.is_degraded())) { - dout(10) << "mds_beacon cluster full, mds" << from << " will be standby" << dendl; + + if (from < 0) { + // standby + if (standby_for < 0) { + dout(10) << "mds_beacon boot: standby for any" << dendl; + pending_mdsmap.standby_any.insert(addr); + } else { + dout(10) << "mds_beacon boot: standby for mds" << standby_for << dendl; + pending_mdsmap.standby_for[standby_for].insert(addr); + } + pending_mdsmap.standby[addr] = standby_for; state = MDSMap::STATE_STANDBY; + } else { + // join|takeover + assert(state == MDSMap::STATE_CREATING || + state == MDSMap::STATE_STARTING || + state == MDSMap::STATE_REPLAY); + + pending_mdsmap.mds_inst[from].addr = addr; + pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from); + pending_mdsmap.mds_inc[from]++; + pending_mdsmap.mds_state[from] = state; + pending_mdsmap.mds_state_seq[from] = seq; } - } - // created? - if (state == MDSMap::STATE_ACTIVE && - mdsmap.is_creating(from)) { - pending_mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << dendl; - } - - // update the map - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << dendl; + // initialize the beacon timer + last_beacon[addr] = g_clock.now(); - // has someone join or leave the cluster? - if (state == MDSMap::STATE_REPLAY || - state == MDSMap::STATE_ACTIVE || - state == MDSMap::STATE_STOPPED) { - pending_mdsmap.same_in_set_since = pending_mdsmap.epoch; + } else { + // state change + dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) + << " -> " << MDSMap::get_state_name(state) + << dendl; + + // change the state + pending_mdsmap.mds_state[from] = state; + if (pending_mdsmap.is_up(from)) + pending_mdsmap.mds_state_seq[from] = seq; + else + pending_mdsmap.mds_state_seq.erase(from); } - - // change the state - pending_mdsmap.mds_state[from] = state; - if (pending_mdsmap.is_up(from)) - pending_mdsmap.mds_state_seq[from] = seq; - else - pending_mdsmap.mds_state_seq.erase(from); - + dout(7) << "pending map now:" << dendl; print_map(pending_mdsmap); - + paxos->wait_for_commit(new C_Updated(this, from, m)); return true; @@ -367,9 +392,9 @@ bool MDSMonitor::should_propose(double& delay) void MDSMonitor::_updated(int from, MMDSBeacon *m) { - if (m->get_state() == MDSMap::STATE_BOOT) { + if (from < 0) { dout(10) << "_updated (booted) mds" << from << " " << *m << dendl; - mon->osdmon->send_latest(mdsmap.get_inst(from)); + mon->osdmon->send_latest(m->get_source_inst()); } else { dout(10) << "_updated mds" << from << " " << *m << dendl; } @@ -385,41 +410,33 @@ void MDSMonitor::_updated(int from, MMDSBeacon *m) void MDSMonitor::committed() { // check for failed - set standby; set failed; - mdsmap.get_mds_set(standby, MDSMap::STATE_STANDBY); mdsmap.get_failed_mds_set(failed); - if (!standby.empty() && !failed.empty()) { - while (!standby.empty() && !failed.empty()) { - int f = *failed.begin(); - int t = *standby.begin(); - failed.erase(failed.begin()); - standby.erase(standby.begin()); - - dout(0) << "mds" << t << " taking over for mds" << f << dendl; - - // send new map to old inst/name - waiting_for_map.push_back(mdsmap.mds_inst[t]); + if (!mdsmap.standby.empty() && !failed.empty()) { + bool didtakeover = false; + set::iterator p = failed.begin(); + while (p != failed.end()) { + int f = *p++; - pending_mdsmap.mds_inst[f] = mdsmap.mds_inst[t]; - pending_mdsmap.mds_inst[f].name = entity_name_t::MDS(f); - pending_mdsmap.mds_inc[f]++; - pending_mdsmap.mds_state[f] = MDSMap::STATE_REPLAY; - pending_mdsmap.mds_state_seq[f] = mdsmap.mds_state_seq[t]; - - pending_mdsmap.mds_inst.erase(t); - pending_mdsmap.mds_state.erase(t); - pending_mdsmap.mds_state_seq.erase(t); - - last_beacon[f] = last_beacon[t]; - last_beacon.erase(t); + // someone standby for me? + if (mdsmap.standby_for.count(f) && + !mdsmap.standby_for[f].empty()) { + dout(0) << "mds" << f << " standby " << *mdsmap.standby_for[f].begin() << " taking over" << dendl; + take_over(*mdsmap.standby_for[f].begin(), f); + didtakeover = true; + } + else if (!mdsmap.standby_any.empty()) { + dout(0) << "standby " << mdsmap.standby.begin()->first << " taking over for mds" << f << dendl; + take_over(mdsmap.standby.begin()->first, f); + didtakeover = true; + } + } + if (didtakeover) { + dout(7) << "pending map now:" << dendl; + print_map(pending_mdsmap); + propose_pending(); } - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - propose_pending(); } // hackish: did all mds's shut down? @@ -431,6 +448,27 @@ void MDSMonitor::committed() mon->monmap->get_inst(mon->whoami)); } +void MDSMonitor::take_over(entity_addr_t addr, int mds) +{ + pending_mdsmap.mds_inst[mds].addr = addr; + pending_mdsmap.mds_inst[mds].name = entity_name_t::MDS(mds); + pending_mdsmap.mds_inc[mds]++; + pending_mdsmap.mds_state[mds] = MDSMap::STATE_REPLAY; + pending_mdsmap.mds_state_seq[mds] = 0; + + // remove from standby list(s) + pending_mdsmap.standby.erase(addr); + pending_mdsmap.standby_for[mds].erase(addr); + pending_mdsmap.standby_any.erase(addr); + + // send new map to old inst/name + entity_inst_t oldinst; + oldinst.name = entity_name_t::MDS(-2); + oldinst.addr = addr; + waiting_for_map.push_back(oldinst); +} + + bool MDSMonitor::handle_command(MMonCommand *m) { @@ -449,10 +487,10 @@ bool MDSMonitor::handle_command(MMonCommand *m) ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; } } - else if (m->cmd[1] == "set_target_num" && m->cmd.size() > 2) { - pending_mdsmap.target_num = atoi(m->cmd[2].c_str()); + else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) { + pending_mdsmap.max_mds = atoi(m->cmd[2].c_str()); r = 0; - ss << "target_num = " << pending_mdsmap.target_num; + ss << "max_mds = " << pending_mdsmap.max_mds; } } if (r == -EINVAL) { @@ -480,6 +518,16 @@ void MDSMonitor::bcast_latest_mds() p != up.end(); p++) send_full(mdsmap.get_inst(*p)); + + // standby too + entity_inst_t inst; + inst.name = entity_name_t::MDS(-1); + for (map::iterator p = mdsmap.standby.begin(); + p != mdsmap.standby.end(); + p++) { + inst.addr = p->first; + send_full(inst); + } } void MDSMonitor::send_full(entity_inst_t dest) @@ -510,76 +558,70 @@ void MDSMonitor::send_latest(entity_inst_t dest) void MDSMonitor::tick() { // make sure mds's are still alive - utime_t now = g_clock.now(); - // ...if i am an active leader if (!mon->is_leader()) return; if (!paxos->is_active()) return; - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; + utime_t cutoff = g_clock.now(); + cutoff -= g_conf.mds_beacon_grace; - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! - int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_STOPPED; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STARTING: - newstate = MDSMap::STATE_STOPPED; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << dendl; - - // update map - pending_mdsmap.mds_state[*p] = newstate; - pending_mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << dendl; - last_beacon[*p] = now; + map::iterator p = last_beacon.begin(); + while (p != last_beacon.end()) { + entity_addr_t addr = p->first; + p++; + + if (last_beacon[addr] >= cutoff) continue; + + int mds = pending_mdsmap.get_addr_rank(addr); + if (mds >= 0) { + // failure! + int newstate; + switch (pending_mdsmap.get_state(mds)) { + case MDSMap::STATE_CREATING: + newstate = MDSMap::STATE_DNE; // didn't finish creating + break; + + case MDSMap::STATE_STARTING: + newstate = MDSMap::STATE_STOPPED; + break; + + case MDSMap::STATE_REPLAY: + case MDSMap::STATE_RESOLVE: + case MDSMap::STATE_RECONNECT: + case MDSMap::STATE_REJOIN: + case MDSMap::STATE_ACTIVE: + case MDSMap::STATE_STOPPING: + newstate = MDSMap::STATE_FAILED; + break; + + default: + assert(0); } + + dout(10) << "no beacon from mds" << *p << " since " << last_beacon[addr] + << ", marking " << pending_mdsmap.get_state_name(newstate) + << dendl; + + // update map + pending_mdsmap.mds_state[mds] = newstate; + pending_mdsmap.mds_state_seq.erase(mds); + } + else if (pending_mdsmap.is_standby(addr)) { + dout(10) << "no beacon from standby " << addr << " since " << last_beacon[addr] + << ", removing from standby list" + << dendl; + if (pending_mdsmap.standby[addr] >= 0) + pending_mdsmap.standby_for[pending_mdsmap.standby[addr]].erase(addr); + else + pending_mdsmap.standby_any.erase(addr); + pending_mdsmap.standby.erase(addr); + } + else { + dout(0) << "BUG: removing stray " << addr << " from last_beacon map" << dendl; } - if (changed) - propose_pending(); + last_beacon.erase(addr); + propose_pending(); } } @@ -605,7 +647,6 @@ void MDSMonitor::do_stop() pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; break; case MDSMap::STATE_CREATING: - case MDSMap::STATE_STANDBY: pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE; break; case MDSMap::STATE_STARTING: @@ -620,6 +661,10 @@ void MDSMonitor::do_stop() break; } } + // hose standby list + pending_mdsmap.standby.clear(); + pending_mdsmap.standby_for.clear(); + pending_mdsmap.standby_any.clear(); propose_pending(); } diff --git a/trunk/ceph/mon/MDSMonitor.h b/trunk/ceph/mon/MDSMonitor.h index 4c8fc91abcbf7..c4dc095236501 100644 --- a/trunk/ceph/mon/MDSMonitor.h +++ b/trunk/ceph/mon/MDSMonitor.h @@ -73,8 +73,10 @@ class MDSMonitor : public PaxosService { bool handle_beacon(class MMDSBeacon *m); bool handle_command(class MMonCommand *m); + void take_over(entity_addr_t addr, int mds); + // beacons - map last_beacon; + map last_beacon; public: MDSMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } -- 2.39.5