static const int STATE_STOPPING = CEPH_MDS_STATE_STOPPING; // up, exporting metadata (-> standby or out)
struct mds_info_t {
+ __u64 global_id;
string name;
int32_t rank;
int32_t inc;
string standby_for_name;
set<int32_t> export_targets;
- mds_info_t() : rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { }
+ mds_info_t() : global_id(0), rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { }
bool laggy() const { return !(laggy_since == utime_t()); }
void clear_laggy() { laggy_since = utime_t(); }
entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); }
void encode(bufferlist& bl) const {
- __u8 v = 2;
+ __u8 v = 3;
::encode(v, bl);
+ ::encode(global_id, bl);
::encode(name, bl);
::encode(rank, bl);
::encode(inc, bl);
void decode(bufferlist::iterator& bl) {
__u8 v;
::decode(v, bl);
+ ::decode(global_id, bl);
::decode(name, bl);
::decode(rank, bl);
::decode(inc, bl);
set<int32_t> in; // currently defined cluster
map<int32_t,int32_t> inc; // most recent incarnation.
set<int32_t> failed, stopped; // which roles are failed or stopped
- map<int32_t,entity_addr_t> up; // who is in those roles
- map<entity_addr_t,mds_info_t> mds_info;
+ map<int32_t,__u64> up; // who is in those roles
+ map<__u64,mds_info_t> mds_info;
friend class MDSMonitor;
__u32 get_cas_pg_pool() const { return cas_pg_pool; }
__u32 get_metadata_pg_pool() const { return metadata_pg_pool; }
- const map<entity_addr_t,mds_info_t>& get_mds_info() { return mds_info; }
- const mds_info_t& get_mds_info(entity_addr_t a) {
- assert(mds_info.count(a));
- return mds_info[a];
+ const map<__u64,mds_info_t>& get_mds_info() { return mds_info; }
+ const mds_info_t& get_mds_info_gid(__u64 gid) {
+ assert(mds_info.count(gid));
+ return mds_info[gid];
}
const mds_info_t& get_mds_info(int m) {
assert(up.count(m) && mds_info.count(up[m]));
}
unsigned get_num_mds(int state) {
unsigned n = 0;
- for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
+ for (map<__u64,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++)
if (p->second.state == state) ++n;
s = in;
}
void get_up_mds_set(set<int>& s) {
- for (map<int32_t,entity_addr_t>::const_iterator p = up.begin();
+ for (map<int32_t,__u64>::const_iterator p = up.begin();
p != up.end();
p++)
s.insert(p->first);
}
void get_recovery_mds_set(set<int>& s) {
s = failed;
- for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
+ for (map<__u64,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++)
if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING)
s.insert(p->second.rank);
}
void get_mds_set(set<int>& s, int state) {
- for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
+ for (map<__u64,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++)
if (p->second.state == state)
int get_random_up_mds() {
if (up.empty())
return -1;
- map<int32_t,entity_addr_t>::iterator p = up.begin();
- for (int n = rand() % up.size(); n; n--) p++;
+ map<int32_t,__u64>::iterator p = up.begin();
+ for (int n = rand() % up.size(); n; n--)
+ p++;
return p->first;
}
- bool find_standby_for(int mds, string& name, entity_addr_t &a) {
- for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
+ __u64 find_standby_for(int mds, string& name) {
+ for (map<__u64,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
if (p->second.rank == -1 &&
p->second.standby_for_name == name) &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
- a = p->second.addr;
- return true;
+ return p->first;
}
}
- for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
+ for (map<__u64,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
if (p->second.rank == -1 &&
p->second.standby_for_name.length() == 0 &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
- a = p->second.addr;
- return true;
+ return p->first;
}
}
- return false;
+ return 0;
}
// mds states
bool is_stopped(int m) { return stopped.count(m); }
bool is_dne(int m) { return in.count(m) == 0; }
- bool is_dne(entity_addr_t a) { return mds_info.count(a) == 0; }
+ bool is_dne_gid(__u64 gid) { return mds_info.count(gid) == 0; }
int get_state(int m) { return up.count(m) ? mds_info[up[m]].state : 0; }
- int get_state(entity_addr_t a) { return mds_info.count(a) ? mds_info[a].state : 0; }
- mds_info_t& get_info(entity_addr_t a) { assert(mds_info.count(a)); return mds_info[a]; }
+ int get_state_gid(__u64 gid) { return mds_info.count(gid) ? mds_info[gid].state : 0; }
+
+ mds_info_t& get_info(int m) { assert(up.count(m)); return mds_info[up[m]]; }
+ mds_info_t& get_info_gid(__u64 gid) { assert(mds_info.count(gid)); return mds_info[gid]; }
bool is_boot(int m) { return get_state(m) == STATE_BOOT; }
bool is_creating(int m) { return get_state(m) == STATE_CREATING; }
bool is_stopping(int m) { return get_state(m) == STATE_STOPPING; }
bool is_clientreplay_or_active_or_stopping(int m) { return is_clientreplay(m) || is_active(m) || is_stopping(m); }
- bool is_laggy(entity_addr_t a) { return mds_info.count(a) && mds_info[a].laggy(); }
+ bool is_laggy_gid(__u64 gid) { return mds_info.count(gid) && mds_info[gid].laggy(); }
// cluster states
return false;
}
- int get_rank(const entity_addr_t& addr) {
- if (mds_info.count(addr))
- return mds_info[addr].rank;
+ int get_rank_gid(__u64 gid) {
+ if (mds_info.count(gid))
+ return mds_info[gid].rank;
return -1;
}
{
entity_addr_t addr = m->get_orig_source_inst().addr;
int state = m->get_state();
+ __u64 gid = m->get_global_id();
version_t seq = m->get_seq();
MDSMap::mds_info_t info;
return false;
// booted, but not in map?
- if (pending_mdsmap.is_dne(addr)) {
+ if (pending_mdsmap.is_dne_gid(gid)) {
if (state != MDSMap::STATE_BOOT) {
dout(7) << "mds_beacon " << *m << " is not in mdsmap" << dendl;
mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
return false; // not booted yet.
}
}
- info = pending_mdsmap.get_info(addr);
+ info = pending_mdsmap.get_info_gid(gid);
// old seq?
if (info.state_seq > seq) {
ignore:
// note time and reply
dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl;
- last_beacon[addr].stamp = g_clock.now();
- last_beacon[addr].seq = seq;
+ last_beacon[gid].stamp = g_clock.now();
+ last_beacon[gid].seq = seq;
mon->send_reply(m,
- new MMDSBeacon(mon->monmap->fsid, m->get_name(),
+ new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
mdsmap.get_epoch(), state, seq));
// done
bool MDSMonitor::preprocess_offload_targets(MMDSLoadTargets* m)
{
dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
- const entity_addr_t& a = m->get_orig_source_addr();
- if (mdsmap.mds_info.count(a) &&
- m->targets == mdsmap.mds_info[a].export_targets)
+ __u64 gid = m->global_id;
+ if (mdsmap.mds_info.count(gid) &&
+ m->targets == mdsmap.mds_info[gid].export_targets)
return true;
return false;
}
// -- this is an update --
dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
entity_addr_t addr = m->get_orig_source_inst().addr;
+ __u64 gid = m->get_global_id();
int state = m->get_state();
version_t seq = m->get_seq();
// boot?
if (state == MDSMap::STATE_BOOT) {
// add
- MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
+ MDSMap::mds_info_t& info = pending_mdsmap.mds_info[gid];
+ info.global_id = gid;
info.name = m->get_name();
info.rank = -1;
info.addr = addr;
info.standby_for_name = m->get_standby_for_name();
// initialize the beacon timer
- last_beacon[addr].stamp = g_clock.now();
- last_beacon[addr].seq = seq;
+ last_beacon[gid].stamp = g_clock.now();
+ last_beacon[gid].seq = seq;
} else {
// state change
- MDSMap::mds_info_t& info = pending_mdsmap.get_info(addr);
+ MDSMap::mds_info_t& info = pending_mdsmap.get_info_gid(gid);
if (info.laggy()) {
- dout(10) << "prepare_beacon clearly laggy flag on " << addr << dendl;
+ // the beacon proves the daemon is alive again, so its laggy mark is being cleared
+ dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
pending_mdsmap.up.erase(info.rank);
pending_mdsmap.in.erase(info.rank);
pending_mdsmap.stopped.insert(info.rank);
- pending_mdsmap.mds_info.erase(addr); // last! info is a ref into this map
- last_beacon.erase(addr);
+ pending_mdsmap.mds_info.erase(gid); // last! info is a ref into this map
+ last_beacon.erase(gid);
} else {
info.state = state;
info.state_seq = seq;
bool MDSMonitor::prepare_offload_targets(MMDSLoadTargets *m)
{
- const entity_addr_t& a = m->get_orig_source_addr();
- if (pending_mdsmap.mds_info.count(a)) {
- dout(10) << "prepare_offload_targets " << a << " " << m->targets << dendl;
- pending_mdsmap.mds_info[a].export_targets = m->targets;
+ __u64 gid = m->global_id;
+ if (pending_mdsmap.mds_info.count(gid)) {
+ dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
+ pending_mdsmap.mds_info[gid].export_targets = m->targets;
} else {
- dout(10) << "prepare_offload_targets " << a << " not in map" << dendl;
+ dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
}
return true;
}
int who = atoi(m->cmd[2].c_str());
if (mdsmap.is_active(who)) {
r = 0;
- entity_addr_t a = pending_mdsmap.up[who];
- ss << "telling mds" << who << " " << a << " to stop";
- pending_mdsmap.mds_info[a].state = MDSMap::STATE_STOPPING;
+ __u64 gid = pending_mdsmap.up[who];
+ ss << "telling mds" << who << " " << pending_mdsmap.mds_info[gid].addr << " to stop";
+ pending_mdsmap.mds_info[gid].state = MDSMap::STATE_STOPPING;
} else {
r = -EEXIST;
ss << "mds" << who << " not active ("
string name;
while (pending_mdsmap.is_in(mds))
mds++;
- entity_addr_t addr;
- if (!pending_mdsmap.find_standby_for(mds, name, addr))
+ __u64 newgid = pending_mdsmap.find_standby_for(mds, name);
+ if (!newgid)
break;
- dout(1) << "adding standby " << addr << " as mds" << mds << dendl;
+ MDSMap::mds_info_t& info = pending_mdsmap.mds_info[newgid];
+ dout(1) << "adding standby " << info.addr << " as mds" << mds << dendl;
- MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
info.rank = mds;
if (pending_mdsmap.stopped.count(mds)) {
info.state = MDSMap::STATE_STARTING;
info.state = MDSMap::STATE_CREATING;
info.inc = ++pending_mdsmap.inc[mds];
pending_mdsmap.in.insert(mds);
- pending_mdsmap.up[mds] = addr;
+ pending_mdsmap.up[mds] = newgid;
do_propose = true;
}
cutoff -= g_conf.mds_beacon_grace;
// make sure last_beacon is fully populated
- for (map<entity_addr_t,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
+ for (map<__u64,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
p != pending_mdsmap.mds_info.end();
++p)
- if (last_beacon.count(p->second.addr) == 0) {
+ if (last_beacon.count(p->first) == 0) {
const MDSMap::mds_info_t& info = p->second;
dout(10) << " adding " << p->second.addr << " mds" << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " to last_beacon" << dendl;
- last_beacon[p->second.addr].stamp = g_clock.now();
- last_beacon[p->second.addr].seq = 0;
+ last_beacon[p->first].stamp = g_clock.now();
+ last_beacon[p->first].seq = 0;
}
if (mon->osdmon()->paxos->is_writeable()) {
bool propose_osdmap = false;
- map<entity_addr_t, beacon_info_t>::iterator p = last_beacon.begin();
+ map<__u64, beacon_info_t>::iterator p = last_beacon.begin();
while (p != last_beacon.end()) {
- entity_addr_t addr = p->first;
+ __u64 gid = p->first;
utime_t since = p->second.stamp;
__u64 seq = p->second.seq;
p++;
- if (pending_mdsmap.mds_info.count(addr) == 0) {
+ if (pending_mdsmap.mds_info.count(gid) == 0) {
// clean it out
- last_beacon.erase(addr);
+ last_beacon.erase(gid);
continue;
}
if (since >= cutoff)
continue;
- MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
+ MDSMap::mds_info_t& info = pending_mdsmap.mds_info[gid];
- dout(10) << "no beacon from " << addr << " mds" << info.rank << "." << info.inc
+ dout(10) << "no beacon from " << info.addr << " mds" << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " since " << since << dendl;
// are we in?
// and is there a non-laggy standby that can take over for us?
- entity_addr_t sa;
+ __u64 sgid;
if (info.rank >= 0 &&
info.state != CEPH_MDS_STATE_STANDBY &&
- pending_mdsmap.find_standby_for(info.rank, info.name, sa)) {
- MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
- dout(10) << " replacing " << addr << " mds" << info.rank << "." << info.inc
+ (sgid = pending_mdsmap.find_standby_for(info.rank, info.name)) != 0) {
+ MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sgid];
+ // log both ends of the takeover: the silent daemon (info) and the
+ // standby that replaces it (si, keyed by sgid)
+ dout(10) << " replacing " << info.addr << " mds" << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
- << " with " << si.name << " " << sa << dendl;
+ << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
switch (info.state) {
case MDSMap::STATE_CREATING:
case MDSMap::STATE_STARTING:
info.state_seq = seq;
if (si.state > 0) {
si.inc = ++pending_mdsmap.inc[info.rank];
- pending_mdsmap.up[info.rank] = sa;
+ pending_mdsmap.up[info.rank] = sgid;
pending_mdsmap.last_failure = pending_mdsmap.epoch;
}
+ // Copy the address out first: 'info' is a reference into mds_info and
+ // dangles once its entry is erased, but the blacklist call below still
+ // needs the failed daemon's address.
+ entity_addr_t failed_addr = info.addr;
- pending_mdsmap.mds_info.erase(addr);
+ pending_mdsmap.mds_info.erase(gid);
if (si.state > 0) {
// blacklist
utime_t until = now;
until += g_conf.mds_blacklist_interval;
- mon->osdmon()->blacklist(addr, until);
+ mon->osdmon()->blacklist(failed_addr, until);
propose_osdmap = true;
}
do_propose = true;
} else if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
- dout(10) << " failing " << addr << " mds" << info.rank << "." << info.inc
+ dout(10) << " failing " << info.addr << " mds" << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< dendl;
- pending_mdsmap.mds_info.erase(addr);
+ pending_mdsmap.mds_info.erase(gid);
do_propose = true;
} else if (!info.laggy()) {
// just mark laggy
- dout(10) << " marking " << addr << " mds" << info.rank << "." << info.inc
+ dout(10) << " marking " << info.addr << " mds" << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
info.laggy_since = now;
do_propose = true;
}
- last_beacon.erase(addr);
+ last_beacon.erase(gid);
}
if (propose_osdmap)
set<int>::iterator p = failed.begin();
while (p != failed.end()) {
int f = *p++;
- entity_addr_t sa;
+ __u64 sgid;
string name; // FIXME
- if (pending_mdsmap.find_standby_for(f, name, sa)) {
- dout(0) << " taking over failed mds" << f << " with " << sa << dendl;
- MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
+ sgid = pending_mdsmap.find_standby_for(f, name);
+ if (sgid) {
+ MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sgid];
+ dout(0) << " taking over failed mds" << f << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
si.state = MDSMap::STATE_REPLAY;
si.rank = f;
si.inc = ++pending_mdsmap.inc[f];
pending_mdsmap.in.insert(f);
- pending_mdsmap.up[f] = sa;
+ pending_mdsmap.up[f] = sgid;
do_propose = true;
}
}
pending_mdsmap.get_num_mds(MDSMap::STATE_STANDBY) >= pending_mdsmap.get_num_mds()) {
// see which nodes are shadowed
set<int> shadowed;
- map<int, set<entity_addr_t> > avail;
- for (map<entity_addr_t,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
+ map<int, set<__u64> > avail;
+ for (map<__u64,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
p != pending_mdsmap.mds_info.end();
p++) {
if (p->second.state == MDSMap::STATE_STANDBY_REPLAY)
continue; // already shadowed.
if (pending_mdsmap.get_state(*p) < MDSMap::STATE_ACTIVE)
continue; // only shadow active mds
- entity_addr_t s;
+ __u64 sgid;
if (avail[*p].size()) {
- s = *avail[*p].begin();
+ sgid = *avail[*p].begin();
avail[*p].erase(avail[*p].begin());
} else if (avail[-1].size()) {
- s = *avail[-1].begin();
+ sgid = *avail[-1].begin();
avail[-1].erase(avail[-1].begin());
} else
continue;
- dout(10) << "mds" << *p << " will be shadowed by " << s << dendl;
+ dout(10) << "mds" << *p << " will be shadowed by " << sgid << dendl;
- MDSMap::mds_info_t& info = pending_mdsmap.mds_info[s];
+ MDSMap::mds_info_t& info = pending_mdsmap.mds_info[sgid];
info.rank = *p;
info.state = MDSMap::STATE_STANDBY_REPLAY;
do_propose = true;
dout(7) << "do_stop stopping active mds nodes" << dendl;
print_map(mdsmap);
- map<entity_addr_t,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
+ map<__u64,MDSMap::mds_info_t>::iterator p = pending_mdsmap.mds_info.begin();
while (p != pending_mdsmap.mds_info.end()) {
+ __u64 gid = p->first;
MDSMap::mds_info_t& info = p->second;
p++;
switch (info.state) {
pending_mdsmap.stopped.insert(info.rank);
case MDSMap::STATE_CREATING:
pending_mdsmap.up.erase(info.rank);
- pending_mdsmap.mds_info.erase(info.addr);
- pending_mdsmap.in.erase(info.rank);
+ // drop the rank from every index while 'info' is still valid ...
+ pending_mdsmap.in.erase(info.rank);
+ pending_mdsmap.mds_info.erase(gid); // last! info is a ref into this map
break;
case MDSMap::STATE_REPLAY:
// BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they?
pending_mdsmap.failed.insert(info.rank);
pending_mdsmap.up.erase(info.rank);
- pending_mdsmap.mds_info.erase(info.addr);
- pending_mdsmap.in.erase(info.rank);
+ // finish every use of info.rank before the entry that owns it goes away
+ pending_mdsmap.in.erase(info.rank);
+ pending_mdsmap.mds_info.erase(gid); // last! info is a ref into this map
break;
}