From c32f67f9f5d80cbed9685be3eb23b23b281f6495 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Dec 2008 16:34:54 -0800 Subject: [PATCH] mon: mark unresponsive mds laggy instead of failed until we can replace it This way we flag laggy mds's, but hold out until they come back online or we have a standby cmds to replace them. Should make things much more tolerable. --- src/include/ceph_fs.h | 8 ++++---- src/mds/MDSMap.cc | 32 ++++++++++++++++---------------- src/mds/MDSMap.h | 6 ++++++ src/mon/MDSMonitor.cc | 43 +++++++++++++++++++++++++++++++++---------- src/mon/MDSMonitor.h | 2 +- src/mon/mon_types.h | 2 +- 6 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 4eba54853c978..192fc49fc5e01 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -41,11 +41,11 @@ * client-facing protocol. */ #define CEPH_OSD_PROTOCOL 3 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 2 /* cluster internal */ -#define CEPH_MON_PROTOCOL 2 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 3 /* cluster internal */ +#define CEPH_MON_PROTOCOL 3 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 3 /* public/client */ -#define CEPH_MDSC_PROTOCOL 2 /* public/client */ -#define CEPH_MONC_PROTOCOL 2 /* public/client */ +#define CEPH_MDSC_PROTOCOL 3 /* public/client */ +#define CEPH_MONC_PROTOCOL 3 /* public/client */ /* diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index e864cd86f1910..39ea1db3a8349 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -23,26 +23,21 @@ void MDSMap::print(ostream& out) out << "epoch " << get_epoch() << std::endl; out << "max_mds " << max_mds << std::endl; - entity_inst_t blank; set all; get_mds_set(all); for (set::iterator p = all.begin(); p != all.end(); ++p) { - if (standby_for.count(*p) && !standby_for[*p].empty()) { - out << " mds" << *p << "." << mds_inc[*p] - << " : " << get_state_name(get_state(*p)) - << " : " << (have_inst(*p) ? get_inst(*p) : blank) - << " : +" << standby_for[*p].size() + out << " mds" << *p << "." << mds_inc[*p] + << " : " << get_state_name(get_state(*p)); + if (have_inst(*p)) + out << " : " << get_inst(*p) + << (is_laggy(get_inst(*p).addr) ? " LAGGY" : ""); + if (standby_for.count(*p) && !standby_for[*p].empty()) + out << " : +" << standby_for[*p].size() << " standby " << standby_for[*p] << std::endl; - } else { - out << " mds" << *p << "." << mds_inc[*p] - << " : " << get_state_name(get_state(*p)) - << " : " << (have_inst(*p) ? get_inst(*p) : blank) - << std::endl; - } } if (!standby_any.empty()) { out << " +" << standby_any.size() << " shared standby " << standby_any << std::endl; @@ -59,19 +54,24 @@ void MDSMap::print_summary(ostream& out) get_mds_set(all); int standby_spec = 0; - map by_state; + map by_state; for (set::iterator p = all.begin(); p != all.end(); ++p) { - by_state[get_state(*p)]++; + string s = get_state_name(get_state(*p)); + if (laggy.count(get_inst(*p).addr)) + s += "(laggy)"; + by_state[s]++; standby_spec += get_num_standby_for(*p); } - for (map::iterator p = by_state.begin(); p != by_state.end(); p++) { + for (map::iterator p = by_state.begin(); p != by_state.end(); p++) { if (p != by_state.begin()) ss << ", "; - ss << p->second << " " << MDSMap::get_state_name(p->first); + ss << p->second << " " << p->first; } + if (laggy.size()) + ss << ", " << laggy.size() << " laggy"; if (get_num_standby_any()) ss << ", " << get_num_standby_any() << " standby (any)"; if (standby_spec) diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 0b042ff824dad..a42ddccc53c5a 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -118,6 +118,8 @@ class MDSMap { map standby; // -1 == any map > standby_for; set standby_any; + + set laggy; friend class MDSMonitor; @@ -146,6 +148,8 @@ class MDSMap { int get_tableserver() const { return tableserver; } int get_root() const { return root; } + bool is_laggy(entity_addr_t a) const { return laggy.count(a); } + // counts int get_num_mds() { return get_num_in_mds(); @@ -363,6 +367,7 @@ class MDSMap { ::encode(standby, bl); ::encode(standby_for, bl); ::encode(standby_any, bl); + ::encode(laggy, bl); } void decode(bufferlist& bl) { @@ -383,6 +388,7 @@ class MDSMap { ::decode(standby, p); ::decode(standby_for, p); ::decode(standby_any, p); + ::decode(laggy, p); } diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 99e69203d7b79..30a03e01508af 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -173,9 +173,13 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) return false; // can i handle this query without a map update? - + + // no longer laggy? + if (pending_mdsmap.laggy.count(addr)) { + return false; // need to update map. + } // boot? - if (state == MDSMap::STATE_BOOT) { + else if (state == MDSMap::STATE_BOOT) { // already booted? if (pending_mdsmap.get_addr_rank(addr) == -1) return false; // not booted|booting|standby yet @@ -233,7 +237,7 @@ bool MDSMonitor::prepare_update(Message *m) switch (m->get_type()) { case MSG_MDS_BEACON: - return handle_beacon((MMDSBeacon*)m); + return prepare_beacon((MMDSBeacon*)m); case MSG_MON_COMMAND: return prepare_command((MMonCommand*)m); @@ -248,7 +252,7 @@ bool MDSMonitor::prepare_update(Message *m) -bool MDSMonitor::handle_beacon(MMDSBeacon *m) +bool MDSMonitor::prepare_beacon(MMDSBeacon *m) { // -- this is an update -- dout(12) << "handle_beacon " << *m @@ -259,6 +263,11 @@ bool MDSMonitor::handle_beacon(MMDSBeacon *m) int state = m->get_state(); version_t seq = m->get_seq(); + if (pending_mdsmap.laggy.count(addr)) { + dout(10) << "prepare_beacon clearly laggy flag on " << addr << dendl; + pending_mdsmap.laggy.erase(addr); + } + // boot? int standby_for = -1; if (state == MDSMap::STATE_BOOT) { @@ -662,9 +671,21 @@ void MDSMonitor::tick() entity_addr_t addr = p->first; p++; - if (last_beacon[addr] >= cutoff) continue; + if (last_beacon[addr] >= cutoff) + continue; int mds = pending_mdsmap.get_addr_rank(addr); + + if ((mds < 0 || pending_mdsmap.standby_for.count(mds) == 0) && + pending_mdsmap.standby_any.empty()) { + // laggy! + dout(10) << "no beacon from mds" << mds << " " << *p << " since " << last_beacon[addr] + << ", marking laggy" << dendl; + pending_mdsmap.laggy.insert(addr); + do_propose = true; + continue; + } + if (mds >= 0) { // failure! int curstate = pending_mdsmap.get_state(mds); @@ -675,14 +696,14 @@ void MDSMonitor::tick() newstate = MDSMap::STATE_DNE; // didn't finish creating last_beacon.erase(addr); break; - + case MDSMap::STATE_STARTING: newstate = MDSMap::STATE_STOPPED; break; - + case MDSMap::STATE_STOPPED: break; - + case MDSMap::STATE_REPLAY: case MDSMap::STATE_RESOLVE: case MDSMap::STATE_RECONNECT: @@ -693,11 +714,11 @@ void MDSMonitor::tick() newstate = MDSMap::STATE_FAILED; pending_mdsmap.last_failure = pending_mdsmap.epoch; break; - + default: assert(0); } - + dout(10) << "no beacon from mds" << mds << " " << *p << " since " << last_beacon[addr] << ", marking " << pending_mdsmap.get_state_name(newstate) << dendl; @@ -705,6 +726,7 @@ void MDSMonitor::tick() // update map pending_mdsmap.mds_state[mds] = newstate; pending_mdsmap.mds_state_seq.erase(mds); + pending_mdsmap.laggy.erase(addr); } else if (pending_mdsmap.is_standby(addr)) { dout(10) << "no beacon from standby " << addr << " since " << last_beacon[addr] @@ -715,6 +737,7 @@ void MDSMonitor::tick() else pending_mdsmap.standby_any.erase(addr); pending_mdsmap.standby.erase(addr); + pending_mdsmap.laggy.erase(addr); } else { dout(0) << "BUG: removing stray " << addr << " from last_beacon map" << dendl; diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index 075f42add26f9..a0fb17133aab7 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -75,7 +75,7 @@ class MDSMonitor : public PaxosService { void committed(); bool preprocess_beacon(class MMDSBeacon *m); - bool handle_beacon(class MMDSBeacon *m); + bool prepare_beacon(class MMDSBeacon *m); void handle_mds_getmap(MMDSGetMap *m); void take_over(entity_addr_t addr, int mds); diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h index f404fa1b09849..500082d4f95ae 100644 --- a/src/mon/mon_types.h +++ b/src/mon/mon_types.h @@ -33,6 +33,6 @@ inline const char *get_paxos_name(int p) { } } -#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v007" +#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v008" #endif -- 2.39.5