From 11189debdb8f47805f8ed5bb4ab9a990e353f381 Mon Sep 17 00:00:00 2001 From: sageweil Date: Thu, 25 Jan 2007 19:45:09 +0000 Subject: [PATCH] mds state beacon working (monitor doesn't detect failures yet) git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1035 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 5 +- branches/sage/cephmds2/common/Timer.cc | 2 +- branches/sage/cephmds2/config.cc | 3 + branches/sage/cephmds2/config.h | 3 + branches/sage/cephmds2/mds/MDCache.cc | 2 +- branches/sage/cephmds2/mds/MDS.cc | 198 ++++++++++++--- branches/sage/cephmds2/mds/MDS.h | 23 +- branches/sage/cephmds2/mds/MDSMap.h | 46 +++- branches/sage/cephmds2/messages/MMDSBeacon.h | 54 ++++ branches/sage/cephmds2/mon/MDSMonitor.cc | 245 +++++++++++++++---- branches/sage/cephmds2/mon/MDSMonitor.h | 15 +- branches/sage/cephmds2/mon/Monitor.cc | 2 +- branches/sage/cephmds2/mon/OSDMonitor.cc | 2 +- branches/sage/cephmds2/msg/Message.cc | 6 +- branches/sage/cephmds2/msg/Message.h | 8 +- 15 files changed, 500 insertions(+), 114 deletions(-) create mode 100644 branches/sage/cephmds2/messages/MMDSBeacon.h diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 0700bfeb0cb83..3c5c9cb2b0ff8 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -1,9 +1,12 @@ blech: -- metablob shoudl mark explicitly dirtied items, and return 'expired' if they have +- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) + - mds failure detection, failure decision by monitor. - keepalive beacons! +- mds state changes for shutdown + mds - journal+recovery diff --git a/branches/sage/cephmds2/common/Timer.cc b/branches/sage/cephmds2/common/Timer.cc index d70259c3e0a08..51cb2e2883909 100644 --- a/branches/sage/cephmds2/common/Timer.cc +++ b/branches/sage/cephmds2/common/Timer.cc @@ -21,7 +21,7 @@ #include "include/Context.h" #undef dout -#define dout(x) if (x <= g_conf.debug) cout << "Timer: " +#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER " #define DBL 10 diff --git a/branches/sage/cephmds2/config.cc b/branches/sage/cephmds2/config.cc index f01bf3e76b389..4542d153dd493 100644 --- a/branches/sage/cephmds2/config.cc +++ b/branches/sage/cephmds2/config.cc @@ -157,6 +157,9 @@ md_config_t g_conf = { mds_decay_halflife: 30, + mds_beacon_interval: 5.0, + mds_beacon_grace: 15.0, + mds_log: true, mds_log_max_len: MDS_CACHE_SIZE / 3, mds_log_max_trimming: 10000, diff --git a/branches/sage/cephmds2/config.h b/branches/sage/cephmds2/config.h index 5ca21583cb7ac..b811f1e6fdb7a 100644 --- a/branches/sage/cephmds2/config.h +++ b/branches/sage/cephmds2/config.h @@ -145,6 +145,9 @@ struct md_config_t { float mds_decay_halflife; + float mds_beacon_interval; + float mds_beacon_grace; + bool mds_log; int mds_log_max_len; int mds_log_max_trimming; diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index a6ee1e46c466a..88cc5bd14ce68 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -504,7 +504,7 @@ bool MDCache::shutdown_pass() { dout(7) << "shutdown_pass" << endl; //assert(mds->is_shutting_down()); - if (mds->is_down()) { + if (mds->is_out()) { dout(7) << " already shut down" << endl; show_cache(); show_imports(); diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 73387610a62b6..aa5194ca385c1 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -44,7 +44,7 @@ #include "common/Timer.h" #include "messages/MMDSMap.h" -#include "messages/MMDSBoot.h" +#include "messages/MMDSBeacon.h" #include "messages/MPing.h" #include "messages/MPingAck.h" @@ -91,10 +91,17 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) { server = new Server(this); locker = new Locker(this, mdcache); + + // beacon + beacon_last_seq = 0; + beacon_sender = 0; + beacon_killer = 0; + req_rate = 0; - state = MDSMap::STATE_DOWN; // booting + state = MDSMap::STATE_OUT; + want_state = MDSMap::STATE_STARTING; logger = logger2 = 0; @@ -208,10 +215,9 @@ public: int MDS::init() { - // request osd map - dout(5) << "requesting mds and osd maps from mon" << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBoot, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + // starting beacon. this will induce an MDSMap from the monitor + state = MDSMap::STATE_STARTING; + beacon_start(); // schedule tick g_timer.add_event_after(1.0, new C_MDS_Tick(this)); @@ -241,7 +247,7 @@ void MDS::tick() } // booted? - if (!is_booting()) { + if (is_active()) { // balancer balancer->tick(); @@ -274,44 +280,169 @@ void MDS::tick() mds_lock.Unlock(); } + + + +// ----------------------- +// beacons + +void MDS::beacon_start() +{ + beacon_send(0); // send first beacon + reset_beacon_killer(); // schedule killer +} + + +class C_MDS_BeaconSender : public Context { + MDS *mds; +public: + C_MDS_BeaconSender(MDS *m) : mds(m) {} + void finish(int r) { + mds->mds_lock.Lock(); + mds->beacon_send(this); + mds->mds_lock.Unlock(); + } +}; + +void MDS::beacon_send(Context *c) +{ + if (c && c != beacon_sender) { + derr(0) << "beacon_send beacon_sender doesn't match, noop" << endl; + return; + } + + ++beacon_last_seq; + dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) + << " seq " << beacon_last_seq << endl; + + beacon_seq_stamp[beacon_last_seq] = g_clock.now(); + + int mon = monmap->pick_mon(); + messenger->send_message(new MMDSBeacon(want_state, beacon_last_seq), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + + // schedule next sender + beacon_sender = new C_MDS_BeaconSender(this); + g_timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); +} + +void MDS::handle_mds_beacon(MMDSBeacon *m) +{ + dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) + << " seq " << m->get_seq() << endl; + version_t seq = m->get_seq(); + + // update lab + if (beacon_seq_stamp.count(seq)) { + assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); + beacon_last_acked_stamp = beacon_seq_stamp[seq]; + + // clean up seq_stamp map + while (!beacon_seq_stamp.empty() && + beacon_seq_stamp.begin()->first <= seq) + beacon_seq_stamp.erase(beacon_seq_stamp.begin()); + + reset_beacon_killer(); + } + + delete m; +} + +class C_MDS_BeaconKiller : public Context { + MDS *mds; + utime_t lab; +public: + C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} + void finish(int r) { + mds->beacon_kill(lab); + } +}; + +void MDS::reset_beacon_killer() +{ + utime_t when = beacon_last_acked_stamp; + when += g_conf.mds_beacon_grace; + + dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp + << ", will die at " << when << endl; + + if (beacon_killer) + g_timer.cancel_event(beacon_killer); + + beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); + g_timer.add_event_at(when, beacon_killer); +} + +void MDS::beacon_kill(utime_t lab) +{ + if (lab == beacon_last_acked_stamp) { + dout(0) << "beacon_kill last_acked_stamp " << lab + << ", killing myself." + << endl; + exit(0); + } else { + dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp + << " != my " << lab + << ", doing nothing." + << endl; + } +} + + + void MDS::handle_mds_map(MMDSMap *m) { - map::reverse_iterator p = m->maps.rbegin(); + map::reverse_iterator p = m->maps.rbegin(); // fixme multiple maps? dout(1) << "handle_mds_map epoch " << p->first << endl; mdsmap->decode(p->second); - delete m; - // see who i am - int w = mdsmap->get_inst_rank(messenger->get_myinst()); - if (w != whoami) { - whoami = w; - messenger->reset_myaddr(MSG_ADDR_MDS(w)); + int oldwhoami = whoami; + whoami = mdsmap->get_inst_rank(messenger->get_myinst()); + if (oldwhoami != whoami) { + messenger->reset_myaddr(MSG_ADDR_MDS(whoami)); reopen_log(); } - dout(1) << "map says i am " << w << endl; - if (is_booting()) { + // update my state + int oldstate = state; + state = mdsmap->get_state(whoami); + + if (oldstate == MDSMap::STATE_OUT && state == MDSMap::STATE_CREATING) { + // special case at startup (monitor decides whether i am creating or starting) + assert(want_state == MDSMap::STATE_STARTING); + want_state = MDSMap::STATE_CREATING; + } + + dout(1) << "handle_mds_map i am mds" << whoami << " with state " << mdsmap->get_state_name(state) << endl; + + // do i need an osdmap? + if (oldwhoami < 0) { // we need an osdmap too. int mon = monmap->pick_mon(); messenger->send_message(new MOSDGetMap(0), MSG_ADDR_MON(mon), monmap->get_inst(mon)); } + + delete m; } void MDS::handle_osd_map(MOSDMap *m) { + version_t had = osdmap->get_epoch(); + // process locally objecter->handle_osd_map(m); - - if (is_booting()) { - // we got our maps. mkfs for recovery? - if (g_conf.mkfs) - boot_mkfs(); - else + + if (had == 0) { + if (is_starting()) boot_recover(); - } + else if (is_creating()) + boot_mkfs(); + else + assert(0); + } // pass on to clients for (set::iterator it = clientmap.get_mount_set().begin(); @@ -350,7 +481,7 @@ void MDS::boot_mkfs() CDir *dir = root->dir; dir->mark_complete(); dir->mark_dirty(dir->pre_dirty()); - + // save it mdstore->commit_dir(dir, fin->new_sub()); } @@ -393,9 +524,6 @@ public: void MDS::boot_recover(int step) { - if (is_booting()) - state = MDSMap::STATE_STARTING; - switch (step) { case 0: /* no, EImportMap takes care of all this. @@ -455,6 +583,14 @@ void MDS::mark_active() dout(3) << "mark_active" << endl; state = MDSMap::STATE_ACTIVE; finish_contexts(waitfor_active); // kick waiters + + beacon_send(); + + /* + int who = monmap->pick_mon(); + messenger->send_message(new MMDSState(state), + MSG_ADDR_MON(who), monmap->get_inst(who)); + */ } @@ -511,7 +647,7 @@ int MDS::shutdown_final() { dout(1) << "shutdown" << endl; - state = MDSMap::STATE_DOWN; + state = MDSMap::STATE_OUT; // shut down cache mdcache->shutdown(); @@ -682,12 +818,14 @@ void MDS::proc_message(Message *m) handle_mds_map((MMDSMap*)m); return; + case MSG_MDS_BEACON: + handle_mds_beacon((MMDSBeacon*)m); + return; + case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ handle_shutdown_start(m); return; - - case MSG_PING: handle_ping((MPing*)m); return; diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h index d20e210612116..b80c07f9f5c08 100644 --- a/branches/sage/cephmds2/mds/MDS.h +++ b/branches/sage/cephmds2/mds/MDS.h @@ -94,7 +94,7 @@ class MClientReply; class MHashReaddir; class MHashReaddirReply; - +class MMDSBeacon; class MDS : public Dispatcher { @@ -133,14 +133,15 @@ class MDS : public Dispatcher { protected: // -- MDS state -- - int state; + int state; // my confirmed state + int want_state; // the state i want list waitfor_active; public: void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); } - bool is_down() { return state == MDSMap::STATE_DOWN; } - bool is_booting() { return state == MDSMap::STATE_DOWN; } + bool is_out() { return state == MDSMap::STATE_OUT; } + bool is_failed() { return state == MDSMap::STATE_FAILED; } bool is_creating() { return state == MDSMap::STATE_CREATING; } bool is_standby() { return state == MDSMap::STATE_STANDBY; } bool is_starting() { return state == MDSMap::STATE_STARTING; } @@ -160,6 +161,12 @@ public: finished_queue.splice( finished_queue.end(), ls ); } + // -- keepalive beacon -- + version_t beacon_last_seq; // last seq sent to monitor + map beacon_seq_stamp; // seq # -> time sent + utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked + Context *beacon_sender; + Context *beacon_killer; // next scheduled time of death // shutdown crap @@ -174,7 +181,7 @@ public: friend class MDStore; - + public: MDS(int whoami, Messenger *m, MonMap *mm); ~MDS(); @@ -198,6 +205,12 @@ public: int shutdown_final(); void tick(); + + void beacon_start(); + void beacon_send(Context *c=0); + void beacon_kill(utime_t lab); + void handle_mds_beacon(MMDSBeacon *m); + void reset_beacon_killer(); // messages void proc_message(Message *m); diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h index 7ee59a025c874..89f8012cb15c2 100644 --- a/branches/sage/cephmds2/mds/MDSMap.h +++ b/branches/sage/cephmds2/mds/MDSMap.h @@ -27,15 +27,30 @@ using namespace std; class MDSMap { public: - static const int STATE_OUT = 0; // offline, no metadata. need not exist in map at all. - static const int STATE_DOWN = 1; // offline, holds (er, held) metadata; needs to be recovered. - - static const int STATE_STANDBY = 2; // online, but inactive; waiting for someone to fail. - static const int STATE_CREATING = 3; // online, creating MDS instance (initializing journal, etc.) - static const int STATE_STARTING = 4; // online, scanning journal, recoverying any shared state - static const int STATE_ACTIVE = 5; // online, active - static const int STATE_STOPPING = 6; // online, exporting metadata (-> standby or out) + static const int STATE_OUT = 0; // down, no active metadata. + static const int STATE_FAILED = 1; // down, holds (er, held) metadata; needs to be recovered. + + static const int STATE_STANDBY = 2; // up, but inactive; waiting for someone to fail. + static const int STATE_CREATING = 3; // up, creating MDS instance (initializing journal, etc.) + static const int STATE_STARTING = 4; // up, scanning journal, recoverying any shared state + static const int STATE_ACTIVE = 5; // up, active + static const int STATE_STOPPING = 6; // up, exporting metadata (-> standby or out) + static const char *get_state_name(int s) { + switch (s) { + // down + case STATE_OUT: return "down:out"; + case STATE_FAILED: return "down:failed"; + // up + case STATE_STANDBY: return "up:standby"; + case STATE_CREATING: return "up:creating"; + case STATE_STARTING: return "up:starting"; + case STATE_ACTIVE: return "up:active"; + case STATE_STOPPING: return "up:stopping"; + default: assert(0); + } + } + protected: epoch_t epoch; utime_t ctime; @@ -45,7 +60,8 @@ class MDSMap { set mds_set; // set of MDSs map mds_state; // MDS state - map mds_inst; // online (non-out, non-down) instances + map mds_state_seq; + map mds_inst; // up instances friend class MDSMonitor; @@ -64,14 +80,22 @@ class MDSMap { const set& get_mds_set() const { return mds_set; } - bool is_out(int m) { return mds_state.count(m) == 0 && mds_state[m] == STATE_OUT; } - bool is_down(int m) { return mds_state.count(m) && mds_state[m] == STATE_DOWN; } + bool is_down(int m) { return is_out(m) || is_failed(m); } + bool is_up(int m) { return !is_down(m); } + + bool is_out(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_OUT; } + bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } + int get_state(int m) { + if (mds_state.count(m)) return mds_state[m]; + return STATE_OUT; + } + const entity_inst_t& get_inst(int m) { assert(mds_inst.count(m)); return mds_inst[m]; diff --git a/branches/sage/cephmds2/messages/MMDSBeacon.h b/branches/sage/cephmds2/messages/MMDSBeacon.h new file mode 100644 index 0000000000000..86eccc689d396 --- /dev/null +++ b/branches/sage/cephmds2/messages/MMDSBeacon.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MMDSBEACON_H +#define __MMDSBEACON_H + +#include "msg/Message.h" + +#include "include/types.h" + +#include "mds/MDSMap.h" + +class MMDSBeacon : public Message { + int state; + version_t seq; + + public: + MMDSBeacon() : Message(MSG_MDS_BEACON) {} + MMDSBeacon(int st, version_t se) : Message(MSG_MDS_BEACON), + state(st), seq(se) { } + + int get_state() { return state; } + version_t get_seq() { return seq; } + char *get_type_name() { return "mdsbeacon"; } + + void print(ostream& out) { + out << "mdsbeacon(" << MDSMap::get_state_name(state) + << " seq " << seq << ")"; + } + + void encode_payload() { + payload.append((char*)&state, sizeof(state)); + payload.append((char*)&seq, sizeof(seq)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(state), (char*)&state); + off += sizeof(state); + payload.copy(off, sizeof(seq), (char*)&seq); + off += sizeof(seq); + } +}; + +#endif diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index 7e1accbf942f7..2acb4bf97cc16 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -15,10 +15,9 @@ #include "MDSMonitor.h" #include "Monitor.h" -#include "messages/MMDSBoot.h" #include "messages/MMDSMap.h" #include "messages/MMDSGetMap.h" -//#include "messages/MMDSFailure.h" +#include "messages/MMDSBeacon.h" #include "common/Timer.h" @@ -35,91 +34,233 @@ void MDSMonitor::create_initial() { mdsmap.epoch = 0; // until everyone boots mdsmap.ctime = g_clock.now(); + + /* for (int i=0; iget_type()) { - case MSG_MDS_BOOT: - handle_mds_boot((MMDSBoot*)m); + case MSG_MDS_BEACON: + handle_mds_beacon((MMDSBeacon*)m); break; case MSG_MDS_GETMAP: handle_mds_getmap((MMDSGetMap*)m); break; - /* - case MSG_MDS_FAILURE: - handle_mds_failure((MMDSFailure*)m); - break; - */ - - case MSG_SHUTDOWN: - handle_mds_shutdown(m); - break; - default: assert(0); } } -void MDSMonitor::handle_mds_boot(MMDSBoot *m) +void MDSMonitor::print_map() +{ + dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; + entity_inst_t blank; + for (set::iterator p = mdsmap.get_mds_set().begin(); + p != mdsmap.get_mds_set().end(); + ++p) { + dout(7) << " mds" << *p + << " : " << MDSMap::get_state_name(mdsmap.get_state(*p)) + << " : " << (mdsmap.mds_inst.count(*p) ? mdsmap.get_inst(*p) : blank) + << endl; + } +} + +/* +void MDSMonitor::handle_mds_state(MMDSState *m) { - dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; + dout(7) << "mds_state " << MDSMap::get_state_name(m->get_state()) + << " from " << m->get_source() << " at " << m->get_source_inst() << endl; assert(m->get_source().is_mds()); int from = m->get_source().num(); - - // choose an MDS id - if (from < 0 || !mdsmap.is_down(from)) { - for (from=0; ; ++from) - if (mdsmap.is_down(from)) break; - dout(10) << "mds_boot assigned mds" << from << endl; - } + int state = m->get_state(); - if (mdsmap.get_epoch() == 0) { - // waiting for boot! - mdsmap.mds_inst[from] = m->get_source_inst(); - mdsmap.mds_state[from] = MDSMap::STATE_STARTING; + if (state == MDSMap::STATE_STARTING) { + // MDS BOOT + + // choose an MDS id + if (from >= 0) { + // wants to be a specific MDS. + if (mdsmap.is_down(from) || + mdsmap.get_inst(from) == m->get_source_inst()) { + // fine, whatever. + dout(10) << "mds_state assigning requested mds" << from << endl; + } else { + dout(10) << "mds_state not assigning requested mds" << from + << ", that mds is up and someone else" << endl; + from = -1; + } + } + if (from < 0) { + // pick a failed mds? + for (set::iterator p = mdsmap.mds_set.begin(); + p != mdsmap.mds_set.end(); + ++p) { + if (mdsmap.is_failed(*p)) { + dout(10) << "mds_state assigned failed mds" << from << endl; + from = *p; + break; + } + } + } + if (from < 0) { + // ok, just pick any unused mds id. + for (from=0; ; ++from) { + if (mdsmap.is_out(from)) { + dout(10) << "mds_state assigned unused mds" << from << endl; + break; + } + } + } - if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) { - mdsmap.inc_epoch(); - dout(-7) << "mds_boot all MDSs booted." << endl; - mdsmap.encode(maps[mdsmap.get_epoch()]); // 1 - - bcast_latest_mds(); - send_current(); - } else { - dout(7) << "mds_boot waiting for " - << (mdsmap.get_num_mds() - mdsmap.mds_inst.size()) - << " mdss to boot" << endl; + // add to map + mdsmap.mds_set.insert(from); + mdsmap.mds_inst[from] = m->get_source_inst(); + if (!mdsmap.mds_state.count(from)) { + // this is a new MDS! + state = MDSMap::STATE_CREATING; } - return; + } + else if (state == MDSMap::STATE_ACTIVE) { + // MDS now active + assert(mdsmap.is_starting(from) || + mdsmap.is_creating(from)); + } + + // update mds state + if (mdsmap.mds_state.count(from)) { + dout(10) << "mds_state mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) + << " -> " << MDSMap::get_state_name(state) + << endl; } else { - dout(0) << "mds_boot everyone already booted, so who is this? write me." << endl; - assert(0); + dout(10) << "mds_state mds" << from << " is new" + << " -> " << MDSMap::get_state_name(state) + << endl; } + mdsmap.mds_state[from] = state; + + // inc map version + mdsmap.inc_epoch(); + mdsmap.encode(maps[mdsmap.get_epoch()]); + + // bcast map + bcast_latest_mds(); + send_current(); } +*/ -void MDSMonitor::handle_mds_shutdown(Message *m) +void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) { - assert(m->get_source().is_mds()); + dout(7) << "mds_beacon " << *m + << " from " << m->get_source() + << " " << m->get_source_inst() + << endl; int from = m->get_source().num(); + int state = m->get_state(); + version_t seq = m->get_seq(); - mdsmap.remove_mds(from); + // initial boot? + if (state == MDSMap::STATE_STARTING) { + bool booted = false; - dout(7) << "mds_shutdown from " << m->get_source() - << ", still have " << mdsmap.mds_set - << endl; - - // tell someone? - // fixme - - delete m; + // choose an MDS id + if (from >= 0) { + // wants to be a specific MDS. + if (mdsmap.is_down(from) || + mdsmap.get_inst(from) == m->get_source_inst()) { + // fine, whatever. + dout(10) << "mds_beacon assigning requested mds" << from << endl; + booted = true; + } else { + dout(10) << "mds_beacon not assigning requested mds" << from + << ", that mds is up and someone else" << endl; + from = -1; + } + } + if (from < 0) { + // pick a failed mds? + for (set::iterator p = mdsmap.mds_set.begin(); + p != mdsmap.mds_set.end(); + ++p) { + if (mdsmap.is_failed(*p)) { + dout(10) << "mds_beacon assigned failed mds" << from << endl; + from = *p; + booted = true; + break; + } + } + } + if (from < 0) { + // ok, just pick any unused mds id. + for (from=0; ; ++from) { + if (mdsmap.is_out(from)) { + dout(10) << "mds_beacon assigned unused mds" << from << endl; + booted = true; + break; + } + } + } + + // make sure it's in the map + if (booted) { + mdsmap.mds_set.insert(from); + mdsmap.mds_inst[from] = m->get_source_inst(); + } + + if (!mdsmap.mds_state.count(from) || + mdsmap.mds_state[from] == MDSMap::STATE_CREATING) + state = MDSMap::STATE_CREATING; // mds may not know it needs to create + } + + // bad beacon? + if (mdsmap.is_up(from) && + mdsmap.get_inst(from) != m->get_source_inst()) { + dout(7) << "mds_beacon has mismatched inst, dropping" << endl; + delete m; + return; + } + + if (mdsmap.mds_state_seq[from] > seq) { + dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl; + delete m; + return; + } + + + // reply to beacon. + last_beacon[from] = g_clock.now(); // note time + messenger->send_message(m, MSG_ADDR_MDS(from), m->get_source_inst()); + + + // did we update the map? + if (mdsmap.mds_state[from] != state) { + // update mds state + dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) + << " -> " << MDSMap::get_state_name(state) + << endl; + mdsmap.mds_state[from] = state; + mdsmap.mds_state_seq[from] = seq; + + // inc map version + mdsmap.inc_epoch(); + mdsmap.encode(maps[mdsmap.get_epoch()]); + + print_map(); + + // bcast map + bcast_latest_mds(); + send_current(); + } } diff --git a/branches/sage/cephmds2/mon/MDSMonitor.h b/branches/sage/cephmds2/mon/MDSMonitor.h index 58cb8912f0bf6..cd3ea6f58403a 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.h +++ b/branches/sage/cephmds2/mon/MDSMonitor.h @@ -41,7 +41,12 @@ class MDSMonitor : public Dispatcher { //MDSMap::Incremental pending_inc; map awaiting_map; - + + // beacons + map last_beacon; + + bool is_alive(int mds); + // maps void create_initial(); @@ -49,13 +54,15 @@ class MDSMonitor : public Dispatcher { void send_full(msg_addr_t dest, const entity_inst_t& inst); void bcast_latest_mds(); + void print_map(); + //void accept_pending(); // accept pending, new map. //void send_incremental(epoch_t since, msg_addr_t dest); - void handle_mds_boot(class MMDSBoot *m); - void handle_mds_failure(class MMDSFailure *m); + void handle_mds_state(class MMDSState *m); + void handle_mds_beacon(class MMDSBeacon *m); + //void handle_mds_failure(class MMDSFailure *m); void handle_mds_getmap(class MMDSGetMap *m); - void handle_mds_shutdown(Message *m); diff --git a/branches/sage/cephmds2/mon/Monitor.cc b/branches/sage/cephmds2/mon/Monitor.cc index acba0e9f5e45a..f949972ffaf07 100644 --- a/branches/sage/cephmds2/mon/Monitor.cc +++ b/branches/sage/cephmds2/mon/Monitor.cc @@ -152,7 +152,7 @@ void Monitor::dispatch(Message *m) // MDSs - case MSG_MDS_BOOT: + case MSG_MDS_BEACON: case MSG_MDS_GETMAP: mdsmon->dispatch(m); break; diff --git a/branches/sage/cephmds2/mon/OSDMonitor.cc b/branches/sage/cephmds2/mon/OSDMonitor.cc index f81b397fa2ded..33a0f530ef86a 100644 --- a/branches/sage/cephmds2/mon/OSDMonitor.cc +++ b/branches/sage/cephmds2/mon/OSDMonitor.cc @@ -570,7 +570,7 @@ void OSDMonitor::bcast_latest_mds() for (set::iterator i = mon->mdsmon->mdsmap.get_mds_set().begin(); i != mon->mdsmon->mdsmap.get_mds_set().end(); i++) { - if (mon->mdsmon->mdsmap.is_down(*i)) continue; + if (mon->mdsmon->mdsmap.is_out(*i) || mon->mdsmon->mdsmap.is_down(*i)) continue; send_incremental(osdmap.get_epoch()-1, MSG_ADDR_MDS(*i), mon->mdsmon->mdsmap.get_inst(*i)); } } diff --git a/branches/sage/cephmds2/msg/Message.cc b/branches/sage/cephmds2/msg/Message.cc index 2ce6b8bbf1272..2c6baa7aab923 100644 --- a/branches/sage/cephmds2/msg/Message.cc +++ b/branches/sage/cephmds2/msg/Message.cc @@ -50,7 +50,7 @@ using namespace std; #include "messages/MMDSGetMap.h" #include "messages/MMDSMap.h" -#include "messages/MMDSBoot.h" +#include "messages/MMDSBeacon.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -245,8 +245,8 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_MDS_MAP: m = new MMDSMap(); break; - case MSG_MDS_BOOT: - m = new MMDSBoot(); + case MSG_MDS_BEACON: + m = new MMDSBeacon; break; case MSG_MDS_DIRUPDATE: diff --git a/branches/sage/cephmds2/msg/Message.h b/branches/sage/cephmds2/msg/Message.h index 5f41453e9ec40..c726fed07268b 100644 --- a/branches/sage/cephmds2/msg/Message.h +++ b/branches/sage/cephmds2/msg/Message.h @@ -87,10 +87,10 @@ // *** MDS *** -#define MSG_MDS_BOOT 100 -#define MSG_MDS_GETMAP 101 -#define MSG_MDS_MAP 102 -#define MSG_MDS_HEARTBEAT 103 +#define MSG_MDS_GETMAP 102 +#define MSG_MDS_MAP 103 +#define MSG_MDS_HEARTBEAT 104 // for mds load balancer +#define MSG_MDS_BEACON 105 // to monitor #define MSG_MDS_DISCOVER 110 #define MSG_MDS_DISCOVERREPLY 111 -- 2.39.5