We need a state value for standby mdses.
- what if a recovery is queued, or in progress, and the inode is then cowed (copied on write)? can that happen?
- proper handling of cache expire messages during rejoin phase?
-> I think cache expires are fine; the rejoin_ack handler just has to behave correctly if rejoining items go missing
+- add up:standby-replay mode? (see the sketch after this list)
+ - tail the mds log as it is written
+ - periodically check head so that we trim, too
+ - also a brute-force mode that will replay from scratch?
- try_remove_unlinked_dn thing
- rename: importing inode... also journal imported client map?
- rerun destro trace against latest, with various journal lengths
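A rough sketch of how the standby-replay loop above could look (tail the active node's journal, and periodically re-read the journal head so the standby trims in step with the active). This is illustrative only; JournalHead, read_journal_head(), replay_one_event(), and trim_expired() are made-up placeholder names, not the real MDS journal interfaces.

  // Illustrative sketch only -- placeholder types and helpers, not real MDS code paths.
  #include <stdint.h>
  #include <unistd.h>

  struct JournalHead {
    uint64_t write_pos;   // how far the active mds has written
    uint64_t expire_pos;  // how far the active mds has expired/trimmed
  };

  // Placeholder stubs; a real implementation would read and replay through the journaler.
  static JournalHead read_journal_head() { JournalHead h = {0, 0}; return h; }
  static uint64_t replay_one_event(uint64_t pos) { return pos + 1; }   // replay event at pos, return next offset
  static void trim_expired(uint64_t expire_pos) { (void)expire_pos; }  // drop state the active has already expired

  static void standby_replay_loop() {
    uint64_t replayed_to = 0;
    for (;;) {
      JournalHead head = read_journal_head();
      // tail the journal: replay whatever the active node has written since the last pass
      while (replayed_to < head.write_pos)
        replayed_to = replay_one_event(replayed_to);
      // re-check the head so we trim too, tracking the active node's expire point
      trim_expired(head.expire_pos);
      sleep(1);  // poll periodically; a brute-force fallback would instead replay from scratch
    }
  }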
#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds. */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal. */
#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
out << " mds" << *p << "." << mds_inc[*p]
<< " : " << get_state_name(get_state(*p));
if (have_inst(*p))
- out << " : " << get_inst(*p)
+ out << " : " << get_inst(*p).addr
<< (is_laggy(get_inst(*p).addr) ? " LAGGY" : "");
- if (standby_for.count(*p) && !standby_for[*p].empty())
- out << " : +" << standby_for[*p].size()
- << " standby " << standby_for[*p]
- << std::endl;
+ out << "\n";
+ if (standby_for.count(*p) && !standby_for[*p].empty()) {
+ //out << " : +" << standby_for[*p].size() << std::endl;
+ for (set<entity_addr_t>::iterator q = standby_for[*p].begin();
+ q != standby_for[*p].end();
+ q++)
+ out << " mds" << *p << ".? : " << get_state_name(standby[*q].state)
+ << " : " << *q << std::endl;
+ }
}
if (!standby_any.empty()) {
- out << " +" << standby_any.size() << " shared standby " << standby_any << std::endl;
+ for (set<entity_addr_t>::iterator q = standby_any.begin();
+ q != standby_any.end();
+ q++)
+ out << " mds?.? : " << get_state_name(standby[*q].state) << " : " << *q << std::endl;
}
}
static const int STATE_BOOT = CEPH_MDS_STATE_BOOT; // up, boot announcement. destiny unknown.
static const int STATE_STANDBY = CEPH_MDS_STATE_STANDBY; // up, idle. waiting for assignment by monitor.
+ static const int STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY; // up, replaying an active node's journal; ready to take over.
static const int STATE_CREATING = CEPH_MDS_STATE_CREATING; // up, creating MDS instance (new journal, idalloc..).
static const int STATE_STARTING = CEPH_MDS_STATE_STARTING; // up, starting prior stopped MDS instance.
// up and out
case STATE_BOOT: return "up:boot";
case STATE_STANDBY: return "up:standby";
+ case STATE_STANDBY_REPLAY: return "up:standby-replay";
case STATE_CREATING: return "up:creating";
case STATE_STARTING: return "up:starting";
// up and in
return 0;
}
+ struct standby_t {
+ int32_t mds;
+ int32_t state;
+ void encode(bufferlist& bl) const {
+ ::encode(mds, bl);
+ ::encode(state, bl);
+ }
+ void decode(bufferlist::iterator& bl) {
+ ::decode(mds, bl);
+ ::decode(state, bl);
+ }
+ };
+ WRITE_CLASS_ENCODER(standby_t)
+
+
protected:
epoch_t epoch;
epoch_t client_epoch; // incremented only when change is significant to client.
map<int32_t,entity_inst_t> mds_inst; // up instances
map<int32_t,int32_t> mds_inc; // incarnation count (monotonically increases)
- map<entity_addr_t,int32_t> standby; // -1 == any
+ map<entity_addr_t,standby_t> standby; // standby_t.mds == -1 means any
map<int32_t, set<entity_addr_t> > standby_for;
set<entity_addr_t> standby_any;
bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; }
bool is_standby(entity_addr_t a) { return standby.count(a); }
+
+ int get_standby_for(entity_addr_t a) {
+ if (standby.count(a))
+ return standby[a].mds;
+ return -1;
+ }
+ int get_standby_state(entity_addr_t a) {
+ if (standby.count(a))
+ return standby[a].state;
+ return STATE_DNE;
+ }
// cluster states
bool is_full() {
void print(ostream& out);
void print_summary(ostream& out);
};
+WRITE_CLASS_ENCODER(MDSMap::standby_t)
inline ostream& operator<<(ostream& out, MDSMap& m) {
m.print_summary(out);
dout(10) << "mds_beacon boot: standby for mds" << standby_for << dendl;
pending_mdsmap.standby_for[standby_for].insert(addr);
}
- pending_mdsmap.standby[addr] = standby_for;
+ pending_mdsmap.standby[addr].mds = standby_for;
+ pending_mdsmap.standby[addr].state = MDSMap::STATE_STANDBY;
state = MDSMap::STATE_STANDBY;
} else {
// join|takeover
// standby too
entity_inst_t inst;
inst.name = entity_name_t::MDS(-1);
- for (map<entity_addr_t,int32_t>::iterator p = mdsmap.standby.begin();
+ for (map<entity_addr_t,MDSMap::standby_t>::iterator p = mdsmap.standby.begin();
p != mdsmap.standby.end();
p++) {
inst.addr = p->first;
mdsmap.get_state(p->first) != MDSMap::STATE_STOPPED &&
mdsmap.get_state(p->first) != MDSMap::STATE_FAILED)
last_beacon[p->second.addr] = g_clock.now();
- for (map<entity_addr_t,int32_t>::iterator p = mdsmap.standby.begin();
+ for (map<entity_addr_t,MDSMap::standby_t>::iterator p = mdsmap.standby.begin();
p != mdsmap.standby.end();
++p )
if (last_beacon.count(p->first) == 0)
dout(10) << "no beacon from standby " << addr << " since " << last_beacon[addr]
<< ", removing from standby list"
<< dendl;
- if (pending_mdsmap.standby[addr] >= 0)
- pending_mdsmap.standby_for[pending_mdsmap.standby[addr]].erase(addr);
+ if (pending_mdsmap.standby[addr].mds >= 0)
+ pending_mdsmap.standby_for[pending_mdsmap.standby[addr].mds].erase(addr);
else
pending_mdsmap.standby_any.erase(addr);
pending_mdsmap.standby.erase(addr);