-# mpicxx must be on your path; on googoo, this means that
-# /usr/local/mpich2-1.0.2/bin must be on your path.
+# mpicxx must be on your path to build newsyn. on googoo, this means
+# that /usr/local/mpich2-1.0.2/bin must be on your path.
# For now, use g++ most of the time.
-# When compiling MPI stuff, specify myfile.cc instead of myfile.o so that ${MPICC} is
-# invoked instead of the generic .o rule (or it'll use g++).
-# This makes it less annoying to build on non-mpi hosts for dev work, and seems to
-# behave just fine... change ${CC} back to mpicxx if you get paranoid.
+# When compiling MPI stuff, specify myfile.cc instead of myfile.o so
+# that ${MPICC} is invoked instead of the generic .o rule (or it'll
+# use g++). This makes it less annoying to build on non-mpi hosts for
+# dev work, and seems to behave just fine... change ${CC} back to
+# mpicxx if you get paranoid.
#CC = g++
#CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
mon/MonitorStore.o
COMMON_OBJS= \
- msg/Messenger.o\
msg/Message.o\
common/Logger.o\
common/Clock.o\
client/SyntheticClient.o\
client/Trace.o
-TARGETS = cmon cosd cmds cfuse newsyn fakesyn
+TARGETS = cmon cosd cmds cfuse csyn newsyn fakesyn
SRCS=*.cc */*.cc *.h */*.h */*/*.h
gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl
-
-# fuse
-fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o
+# fake*
+fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.o common.o
${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
- ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-
-mpifuse: mpifuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
- ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-
-
-# synthetic workload
fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
-tcpsyn: tcpsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o ${TCP_OBJS} common.o
- ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
+# mpi startup
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@
-newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o
+newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
-# + obfs
-fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o
- ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@
-
-tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o
- ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@
-
# ebofs
-
mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o
${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+# + obfs (old)
+fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o
+ ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@
+
+tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o
+ ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@
+
+osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o
+ ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a
+
# libceph
-libceph.o: client/ldceph.o client/Client.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
- ${LDINC} $@ $^
+libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
+ ${LDINC} $^ -o $@
bench/mdtest/mdtest.o: bench/mdtest/mdtest.c
mpicc -c $^ -o $@
osdc.o: ${OSDC_OBJS}
${LDINC} $@ $^
-osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o
- ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.o
-
mds.o: ${MDS_OBJS}
${LDINC} $@ $^
cat ${SRCS} | wc -l
cat ${SRCS} | grep -c \;
+TAGS:
+	etags `find . -name "*.h" -o -name "*.cc"`
+
.depend:
touch .depend
-- paxos for monitor
-- lnet?
-- crush
- - xml import/export?
- - crush tools
-
-== todo
-
-1- pipelining writes?
-2- intervening reads?
-
-inode ops
- utime -- no concurrency issues
- chown/chmod -- should lock
- truncate -- should lock
- 1-> no. multiple process concurrency on a single inode is not important.
- 2-> maybe... intervening stats? probably not important.
-
-directory ops. parent inode mtime, + dirent xlocks?
- mknod
- open+create
- symlink
- unlink
- rmdir
- rename
- 1-> yes. but mtime updates are independent (mtime monotonically increasing), so it's easy.
- 2-> yes.
-
---> so, make let's make file/hard wrlock exclusive.
-
-locks
- namespace
- path pins -- read lock
- dentry xlock -- write lock
- inode
- hard/file rd start/stop -- read lock
- hard/file wr start/stop -- write lock
-
+monitor
+- finish generic paxos
+osdmon
+- distribute w/ paxos framework
+- allow fresh replacement osds. add osd_created in osdmap, probably
+- monitor needs to monitor some osds...
+- monitor pg states, notify on out?
+- watch osd utilization; adjust overload in cluster map
-- integrate revisions into ObjectCacher
-- clean up oid.rev vs op.rev in osd+osdc
+mdsmon
+- distribute w/ paxos framework
+
+journaler
+- fix up for large events (e.g. imports)
+- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy.
+- should we pad with zeros to avoid splitting individual entries?
+  - make it a g_conf flag?
+  - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes); see the sketch after this list
+- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes
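A minimal sketch of the zero-skipping read loop mentioned in the list above; the function name, buffer-based signature, and 4-byte length prefix are assumptions for illustration, not the actual Journaler code:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// returns the offset of the next nonzero entry-length field at or after pos,
// or len if only padding (zeroed lengths, or <4 trailing bytes) remains.
size_t skip_entry_padding(const char *buf, size_t len, size_t pos)
{
  while (pos + sizeof(uint32_t) <= len) {
    uint32_t esize;
    memcpy(&esize, buf + pos, sizeof(esize));
    if (esize != 0) return pos;     // a real entry starts here
    pos += sizeof(esize);           // zeroed length == padding, keep scanning
  }
  return len;                       // fewer than 4 bytes left: trailing pad
}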
+
+
+crush
+- xml import/export?
+- crush tools
+
+
+rados+ebofs
+- purge replicated writes from cache. (with exception of partial tail blocks.)
-rados paper todo
+rados paper todo?
- better experiments
+ - berkeleydb objectstore?
- flush log only in response to subsequent read or write?
- better behaving recovery
- justify use of splay.
- snapshots
rados snapshots
+- integrate revisions into ObjectCacher
+- clean up oid.rev vs op.rev in osd+osdc
+
- attr.crev is rev we were created in.
- oid.rev=0 is "live". defined for attr.crev <= rev.
- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is a non-inclusive upper bound; see the sketch after this list)
- clean up messenger failure modes.
- add connection retry.
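A tiny illustrative sketch of the "defined for" rule in the attr.crev/oid.rev notes above; the function name and plain integer parameters are assumptions, not existing code:

#include <stdint.h>

// true if an object whose attr.crev is crev and whose oid.rev is oid_rev
// is defined at revision r, per the rules above.
bool defined_at(uint64_t crev, uint64_t oid_rev, uint64_t r)
{
  if (oid_rev == 0)                  // oid.rev == 0 means "live"
    return crev <= r;
  return crev <= r && r < oid_rev;   // oid.rev is a non-inclusive upper bound
}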
-mds recovery
-- multiple passes?
- 1- establish import/export map
- ?-
- 2- replay inode, dir, dentry updates
-- single pass
- - each event needs to embed inode for trace up to the import
- - second stage will reconcile cached items with other active mds nodes
- - cached items will be shared with the primary to repopulate it's non-dirty cache
- - query clients for their state too?
- - mds must journal list of clients with whom we share state?
-
-
-journaler
-- should we pad with zeros to avoid splitting individual entries?
- - make it a g_conf flag?
- - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes)
-- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes
-
-monitor
-?- monitor user lib that handles resending, redirection of mon requests.
-- elector
-/- organize monitor store
-
-osdmon
-- distribute
-- recovery: store elector epochs with maps..
-- monitor needs to monitor some osds...
-- monitor pgs, notify on out
-- watch osd utilization; adjust overload in cluster map
-
-mdsmon
+objecter
+- read+floor_lockout
osd/rados
+- read+floor_lockout for clean STONITH-like/fencing semantics after failover.
+- separate out replication code into a PG class, to pave way for RAID
+
- efficiently replicate clone() objects
- pg_num instead of pg_bits
- flag missing log entries on crash recovery --> WRNOOP? or WRLOST?
- pg_bit/pg_num changes
- report crashed pgs?
-messenger
-/- share same tcp socket for sender and receiver
-/- graceful connection teardown
+simplemessenger
- close idle connections
-- generalize out a transport layer?
- - eg reliable tcp for most things, connectionless unreliable datagrams for monitors?
- - or, aggressive connection closing on monitors? or just max_connections and an lru?
-- osds: forget idle client addrs
-
-objecter
+- retry, timeout on connection or transmission failure
objectcacher
- ocacher caps transitions vs locks
- test read locks
reliability
-- heartbeat vs ping
+- heartbeat vs ping?
- osdmonitor, filter
ebofs
- metadata in nvram? flash?
-
-bugs/stability
-- figure out weird 40ms latency with double log entries
-
-
-general
-- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown
-- well, just figure out general timer cancellation strategy that avoids races
- - use updated Timer as a model?
-
-
remaining hard problems
- how to cope with file size changes and read/write sharing
-- mds failure recovery (of course)
crush
- distributed client management
- anchormgr
- 2pc
- - independent journal
+ - independent journal?
- distributed?
- link count management
- also 2pc
client
-- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache
-
-- test client caps with meta exports
-- some heuristic behavior to consolidate caps to inode auth
-- client will re-tx anything it needed to say upon rx of new mds notification (?)
-
-
-
+- fstat
+- make_request: cope with mds failure
+- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate..
+- test client caps migration w/ mds exports
+- some heuristic behavior to consolidate caps to inode auth?
-CLIENT TODO
-
-- statfs
-
-
- dump active config in run output somewhere
+
+
+
+
+
+
+
+
+==== MDS RECOVERY ====
+
+- how to reliably deliver cache expire messages?
+ - how should proxy behave?
+ - exporter failure
+ - all cacheexpire info has been passed on up until point where export is permanent. no impact.
+ - importer failure
+ - exporter collects expire info, so that it can reverse.
+ - ???
+ - maybe hosts should double-up expires until after export is known to have committed?
+--> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :)
+
+*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle!
+- expire should go to both old and new auth
+- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth (see the sketch after this list)
+- does inode need its own replica list? no!
+- dirslices.
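Purely speculative sketch of the "second possible auth" idea in the list above; the pair return type and the dir_auth/dir_auth2 names are taken from the notes, not from existing code:

#include <utility>

// during an export, expires would go to both the old and the new auth.
std::pair<int,int> dir_authority(int dir_auth, int dir_auth2)
{
  if (dir_auth2 >= 0)
    return std::make_pair(dir_auth, dir_auth2);   // mid-export: two auths
  return std::make_pair(dir_auth, -1);            // normal case: single auth
}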
+
+
+/- exporter recovery if importer fails during EXPORT_EXPORTING stage
+- importer recovery if exporter fails
+
+/?- delay response to sending import_map if export in progress?
+/?- finish export before sending import_map?
+/- ambiguous imports on active node should include in-progress imports!
+/- how to effectively trim cache after resolve but before rejoin
+/ - we need to eliminate unneeded non-auth metadata, without hosing potentially useful auth metadata
+
+- osd needs a set_floor_and_read op for safe failover/STONITH-like semantics.
+
+- failures during recovery stages (resolve, rejoin)... make sure rejoin still works!
+
+- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything)
+
+
+importmap only sent after exports have completed.
+failures update export ack waitlists, so exports will complete if unrelated nodes fail.
+importmap can be sent regardless of import status -- pending import is just flagged ambiguous.
+failure of exporter induces some cleanup on importer. importer will disambiguate when it gets an importmap on exporter recovery.
+failure of importer induces cleanup on exporter. no ambiguity.
+
+
+/- no new mds may join if cluster is in a recovery state. starting -> standby (unless failed)
+/ - make sure creating -> standby, and are not included in recovery set?
+
+
+mdsmap notes
+- mds don't care about intervening states, except rejoin > active, and
+  that transition requires active involvement. thus, no need to worry
+  about delivering/processing the full sequence of maps.
+
+blech:
+- EMetablob entries should return 'expired' if they have
+  higher versions (and are thus described by a newer journal entry)
+
+mds
+- mds failure vs clients
+ - clean up client op redirection
+ - idempotent ops
+
+- journal+recovery
+ - unlink
+ - open(wr cap), open+create
+ - file capabilities i/o
+ - link
+ - rename
+
+- should auth_pins really go to the root?
+ - FIXME: auth_pins on importer versus import beneath an authpinned region?
+
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." << pthread_self() << " "
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " "
#define tout if (g_conf.client_trace) cout << "trace: "
Client::Client(Messenger *m, MonMap *mm)
{
// which client am i?
- whoami = m->get_myaddr().num();
+ whoami = m->get_myname().num();
monmap = mm;
mounted = false;
MClientReply *Client::make_request(MClientRequest *req,
bool auth_best,
- int use_mds) // this param is icky, debug weirdness!
+ int use_mds) // this param is purely for debug hacking
{
// assign a unique tid
req->set_tid(++last_tid);
// choose an mds
int mds = 0;
- if (diri) {
+ if (!diri || g_conf.client_use_random_mds) {
+ // no root info, pick a random MDS
+ mds = rand() % mdsmap->get_num_mds();
+ } else {
if (auth_best) {
// pick the actual auth (as best we can)
if (item) {
else
mds = diri->pick_replica(mdsmap);
}
- } else {
- // no root info, pick a random MDS
- mds = rand() % mdsmap->get_num_mds();
}
dout(20) << "mds is " << mds << endl;
tid_t tid = req->get_tid();
mds_rpc_cond[tid] = &cond;
- messenger->send_message(req, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_SERVER);
+ messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER);
// wait
while (mds_rpc_reply.count(tid) == 0) {
if (whoami < 0) {
whoami = m->get_dest().num();
dout(1) << "handle_mds_map i am now " << m->get_dest() << endl;
- messenger->reset_myaddr(m->get_dest());
+ messenger->reset_myname(m->get_dest());
}
- map<epoch_t, bufferlist>::reverse_iterator p = m->maps.rbegin();
-
- dout(1) << "handle_mds_map epoch " << p->first << endl;
- mdsmap->decode(p->second);
+ dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl;
+ mdsmap->decode(m->get_encoded());
delete m;
+ // note our inc #
+ objecter->set_client_incarnation(0); // fixme
+
mount_cond.Signal(); // mount might be waiting for this.
}
<< ", which we don't want caps for, releasing." << endl;
m->set_caps(0);
m->set_wanted(0);
- messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port());
+ messenger->send_message(m, m->get_source_inst(), m->get_source_port());
return;
}
in->file_wr_size = 0;
}
- messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port());
+ messenger->send_message(m, m->get_source_inst(), m->get_source_port());
}
it->second.seq,
it->second.caps,
in->file_caps_wanted());
- messenger->send_message(m, MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER);
+ messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER);
}
}
it->second.caps,
in->file_caps_wanted());
messenger->send_message(m,
- MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER);
+ mdsmap->get_inst(it->first), MDS_PORT_LOCKER);
}
}
delete mdsmap;
int mon = monmap->pick_mon();
messenger->send_message(new MClientBoot(),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ monmap->get_inst(mon));
while (!mdsmap)
mount_cond.Wait(client_lock);
int who = 0; // mdsmap->get_root(); // mount at root, for now
messenger->send_message(m,
- MSG_ADDR_MDS(who), mdsmap->get_inst(who),
+ mdsmap->get_inst(who),
MDS_PORT_SERVER);
while (!mounted)
// send unmount!
Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT);
- messenger->send_message(req, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER);
+ messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER);
while (mounted)
mount_cond.Wait(client_lock);
}
-void Client::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+void Client::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst)
{
if (dest.is_mon()) {
// resend to a different monitor.
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", resending to mon" << mon
<< endl;
- messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(m, monmap->get_inst(mon));
}
else if (dest.is_osd()) {
objecter->ms_handle_failure(m, dest, inst);
int unsafe_sync_write;
public:
- msg_addr_t get_myaddr() { return messenger->get_myaddr(); }
+ entity_name_t get_myname() { return messenger->get_myname(); }
void hack_sync_write_safe();
protected:
int describe_layout(char *fn, list<ObjectExtent>& result);
- void ms_handle_failure(Message*, msg_addr_t dest, const entity_inst_t& inst);
+ void ms_handle_failure(Message*, entity_name_t dest, const entity_inst_t& inst);
};
#endif
*/
#include <iostream>
+#include <sstream>
using namespace std;
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << g_clock.now() << " synthetic" << client->get_nodeid() << " "
// traces
//void trace_include(SyntheticClient *syn, Client *cl, string& prefix);
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
+ } else if (strcmp(args[i],"makedirmess") == 0) {
+ syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRMESS );
+ syn_iargs.push_back( atoi(args[++i]) );
} else if (strcmp(args[i],"statdirs") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_STATDIRS );
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
- } else if (strcmp(args[i],"fullwalk") == 0) {
+ } else if (strcmp(args[i],"walk") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_FULLWALK );
//syn_sargs.push_back( atoi(args[++i]) );
} else if (strcmp(args[i],"randomwalk") == 0) {
}
break;
+ case SYNCLIENT_MODE_MAKEDIRMESS:
+ {
+ string sarg1 = get_sarg(0);
+ int iarg1 = iargs.front(); iargs.pop_front();
+ if (run_me()) {
+ dout(2) << "makedirmess " << sarg1 << " " << iarg1 << endl;
+ make_dir_mess(sarg1.c_str(), iarg1);
+ }
+ }
+ break;
case SYNCLIENT_MODE_MAKEDIRS:
{
string sarg1 = get_sarg(0);
case SYNCLIENT_MODE_FULLWALK:
{
- string sarg1 = get_sarg(0);
+ string sarg1;// = get_sarg(0);
if (run_me()) {
dout(2) << "fullwalk" << sarg1 << endl;
full_walk(sarg1);
{
if (time_to_stop()) return -1;
- // read dir
- map<string, inode_t> contents;
- int r = client->getdir(basedir.c_str(), contents);
- if (r < 0) {
- dout(1) << "readdir on " << basedir << " returns " << r << endl;
- return r;
- }
+ list<string> dirq;
+ dirq.push_back(basedir);
- for (map<string, inode_t>::iterator it = contents.begin();
- it != contents.end();
- it++) {
- string file = basedir + "/" + it->first;
+ while (!dirq.empty()) {
+ string dir = dirq.front();
+ dirq.pop_front();
- struct stat st;
- int r = client->lstat(file.c_str(), &st);
+ // read dir
+ map<string, inode_t> contents;
+ int r = client->getdir(dir.c_str(), contents);
if (r < 0) {
- dout(1) << "stat error on " << file << " r=" << r << endl;
+ dout(1) << "readdir on " << dir << " returns " << r << endl;
continue;
}
-
- if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) full_walk(file);
+
+ for (map<string, inode_t>::iterator it = contents.begin();
+ it != contents.end();
+ it++) {
+ if (it->first == ".") continue;
+ if (it->first == "..") continue;
+ string file = dir + "/" + it->first;
+
+ struct stat st;
+ int r = client->lstat(file.c_str(), &st);
+ if (r < 0) {
+ dout(1) << "stat error on " << file << " r=" << r << endl;
+ continue;
+ }
+
+ if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) {
+ dirq.push_back(file);
+ }
+ }
}
return 0;
}
+
+
+void SyntheticClient::make_dir_mess(const char *basedir, int n)
+{
+ vector<string> dirs;
+
+ dirs.push_back(basedir);
+ dirs.push_back(basedir);
+
+ client->mkdir(basedir, 0755);
+
+ // motivation:
+ // P(dir) ~ subdirs_of(dir) + 2
+ // from 5-year metadata workload paper in fast'07
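+  // (implementation note: each new dir below is pushed twice and its parent
+  //  pushed once more, so a dir's selection weight stays ~ subdir count + 2)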
+
+ // create dirs
+ for (int i=0; i<n; i++) {
+ // pick a dir
+ int k = rand() % dirs.size();
+ string parent = dirs[k];
+
+ // pick a name
+ std::stringstream ss;
+ ss << parent << "/" << i;
+ string dir;
+ ss >> dir;
+
+ // update dirs
+ dirs.push_back(parent);
+ dirs.push_back(dir);
+ dirs.push_back(dir);
+
+ // do it
+ client->mkdir(dir.c_str(), 0755);
+ }
+
+
+}
+
#define SYNCLIENT_MODE_RANDOMWALK 1
#define SYNCLIENT_MODE_FULLWALK 2
-#define SYNCLIENT_MODE_REPEATWALK 7
+#define SYNCLIENT_MODE_REPEATWALK 3
+#define SYNCLIENT_MODE_MAKEDIRMESS 7
#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth
#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth
#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth
int play_trace(Trace& t, string& prefix);
+ void make_dir_mess(const char *basedir, int n);
+
};
#endif
if (g_conf.debug_after)
g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+ // mds specific args
+ int whoami = -1;
+ bool standby = false; // by default, i'll start active.
+ for (unsigned i=0; i<args.size(); i++) {
+ if (strcmp(args[i], "--standby") == 0)
+ standby = true;
+ else if (strcmp(args[i], "--mds") == 0)
+ whoami = atoi(args[++i]);
+ else {
+ cerr << "unrecognized arg " << args[i] << endl;
+ return -1;
+ }
+ }
+
// load monmap
MonMap monmap;
rank.start_rank();
// start mds
- Messenger *m = rank.register_entity(MSG_ADDR_MDS_NEW);
+ Messenger *m = rank.register_entity(MSG_ADDR_MDS(whoami));
assert(m);
- MDS *mds = new MDS(m->get_myaddr().num(), m, &monmap);
- mds->init();
+ MDS *mds = new MDS(whoami, m, &monmap);
+ mds->init(standby);
// wait
rank.wait();
cout << "bound to " << rank.get_listen_addr() << endl;
// add single mon0
- monmap.add_mon(rank.my_inst);
+ entity_inst_t inst;
+ inst.name = MSG_ADDR_MON(0);
+ inst.addr = rank.my_addr;
+ monmap.add_mon(inst);
// write monmap
cout << "writing monmap to " << monmap_fn << endl;;
// bind to a specific port
cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl;
- tcpaddr_t addr = monmap.get_inst(whoami).addr;
- rank.set_listen_addr(addr);
+ g_my_addr = monmap.get_inst(whoami).addr;
rank.start_rank();
}
}
return l;
}
+inline utime_t& operator-=(utime_t& l, double f) {
+ l += -f;
+ return l;
+}
inline bool operator>(const utime_t& a, const utime_t& b)
{
#include "include/Context.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "Timer: "
+#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER "
+#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " TIMER "
#define DBL 10
Timer g_timer;
+
/**** thread solution *****/
+bool Timer::get_next_due(utime_t& when)
+{
+ if (scheduled.empty()) {
+ dout(10) << "get_next_due - nothing scheduled" << endl;
+ return false;
+ } else {
+ map< utime_t, set<Context*> >::iterator it = scheduled.begin();
+ when = it->first;
+ dout(10) << "get_next_due - " << when << endl;
+ return true;
+ }
+}
+
+
void Timer::timer_entry()
{
lock.Lock();
// any events due?
utime_t next;
- Context *event = get_next_scheduled(next);
-
- list<Context*> pending;
+ bool next_due = get_next_due(next);
- if (event && now >= next) {
+ if (next_due && now >= next) {
// move to pending list
- map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+ list<Context*> pending;
+
+ map< utime_t, set<Context*> >::iterator it = scheduled.begin();
while (it != scheduled.end()) {
if (it->first > now) break;
utime_t t = it->first;
dout(DBL) << "queueing event(s) scheduled at " << t << endl;
- for (multiset<Context*>::iterator cit = it->second.begin();
+ for (set<Context*>::iterator cit = it->second.begin();
cit != it->second.end();
cit++) {
pending.push_back(*cit);
num_event--;
}
- map< utime_t, multiset<Context*> >::iterator previt = it;
+ map< utime_t, set<Context*> >::iterator previt = it;
it++;
scheduled.erase(previt);
}
if (!pending.empty()) {
sleeping = false;
lock.Unlock();
- { // make sure we're not holding any locks while we do callbacks
+ {
+ // make sure we're not holding any locks while we do callbacks
// make the callbacks myself.
for (list<Context*>::iterator cit = pending.begin();
cit != pending.end();
cit++) {
- dout(DBL) << "doing callback " << *cit << endl;
+ dout(DBL) << "start callback " << *cit << endl;
(*cit)->finish(0);
+ dout(DBL) << "finish callback " << *cit << endl;
+ delete *cit;
}
pending.clear();
assert(pending.empty());
}
}
-
else {
// sleep
- if (event) {
+ if (next_due) {
dout(DBL) << "sleeping until " << next << endl;
timed_sleep = true;
sleeping = true;
else
sleep_cond.SignalAll();
} else {
- dout(DBL) << "register_timer doing nothing; thread is alive but not sleeping" << endl;
+ dout(DBL) << "register_timer doing nothing; thread is awake" << endl;
// it's probably doing callbacks.
}
} else {
void Timer::add_event_at(utime_t when,
Context *callback)
{
- // insert
+ lock.Lock();
+
dout(DBL) << "add_event " << callback << " at " << when << endl;
- lock.Lock();
- scheduled[ when ].insert(callback);
- assert(event_times.count(callback) == 0); // err.. there can be only one (for now!)
+ // insert
+ scheduled[when].insert(callback);
+ assert(event_times.count(callback) == 0);
event_times[callback] = when;
num_event++;
-
- // make sure i wake up
+
+ // make sure i wake up on time
register_timer();
-
+
lock.Unlock();
}
dout(DBL) << "cancel_event " << callback << endl;
if (!event_times.count(callback)) {
- dout(DBL) << "cancel_event " << callback << " wasn't scheduled?" << endl;
+ dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << endl;
lock.Unlock();
- //assert(0);
return false; // wasn't scheduled.
}
utime_t tp = event_times[callback];
- assert(scheduled.count(tp));
-
- multiset<Context*>::iterator p = scheduled[tp].find(callback); // there may be more than one?
- assert(p != scheduled[tp].end());
- scheduled[tp].erase(p);
-
event_times.erase(callback);
+
+ assert(scheduled.count(tp));
+ assert(scheduled[tp].count(callback));
+ scheduled[tp].erase(callback);
+ if (scheduled[tp].empty())
+ scheduled.erase(tp);
lock.Unlock();
return true;
}
+
+
+// -------------------------------
+
+void SafeTimer::add_event_after(float seconds, Context *c)
+{
+ assert(lock.is_locked());
+ Context *w = new EventWrapper(this, c);
+ dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << endl;
+ scheduled[c] = w;
+ g_timer.add_event_after(seconds, w);
+}
+
+void SafeTimer::add_event_at(utime_t when, Context *c)
+{
+ assert(lock.is_locked());
+ Context *w = new EventWrapper(this, c);
+ dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << endl;
+ scheduled[c] = w;
+ g_timer.add_event_at(when, w);
+}
+
+void SafeTimer::EventWrapper::finish(int r)
+{
+ timer->lock.Lock();
+ if (timer->scheduled.count(actual)) {
+ // still scheduled. execute.
+ actual->finish(r);
+ timer->scheduled.erase(actual);
+ } else {
+ // i was canceled.
+ assert(timer->canceled.count(actual));
+ }
+
+ // did i get canceled?
+ // (this can happen even if i just executed above. e.g., i may have canceled myself.)
+ if (timer->canceled.count(actual)) {
+ timer->canceled.erase(actual);
+ timer->cond.Signal();
+ }
+
+ // delete the original event
+ delete actual;
+
+ timer->lock.Unlock();
+}
+
+void SafeTimer::cancel_event(Context *c)
+{
+ assert(lock.is_locked());
+ assert(scheduled.count(c));
+
+ if (g_timer.cancel_event(scheduled[c])) {
+ // hosed wrapper. hose original event too.
+ delete scheduled[c];
+ } else {
+ // clean up later.
+ canceled[c] = scheduled[c];
+ }
+ scheduled.erase(c);
+}
+
+void SafeTimer::cancel_all()
+{
+ assert(lock.is_locked());
+
+ while (!scheduled.empty())
+ cancel_event(scheduled.begin()->first);
+}
+
+void SafeTimer::join()
+{
+ assert(lock.is_locked());
+ assert(scheduled.empty());
+
+ while (!canceled.empty()) {
+ // wait
+ dout(-10) << "SafeTimer.join waiting for " << canceled.size() << " to join" << endl;
+ dout(-10) << canceled << endl;
+ cond.Wait(lock);
+ }
+}
+
+SafeTimer::~SafeTimer()
+{
+ if (!scheduled.empty() && !canceled.empty()) {
+ derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, "
+ << canceled.size() << " canceled but unflushed"
+ << endl;
+ }
+}
class Timer {
private:
- map< utime_t, multiset<Context*> > scheduled; // time -> (context ...)
+ map< utime_t, set<Context*> > scheduled; // time -> (context ...)
hash_map< Context*, utime_t > event_times; // event -> time
// get time of the next event
- Context* get_next_scheduled(utime_t& when) {
- if (scheduled.empty()) return 0;
- map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
- when = it->first;
- multiset<Context*>::iterator sit = it->second.begin();
- return *sit;
- }
+ //Context* get_next_scheduled(utime_t& when);
+
+ bool get_next_due(utime_t &when);
void register_timer(); // make sure i get a callback
void cancel_timer(); // make sure i get a callback
cancel_timer();
// scheduled
- for (map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+ for (map< utime_t, set<Context*> >::iterator it = scheduled.begin();
it != scheduled.end();
it++) {
- for (multiset<Context*>::iterator sit = it->second.begin();
+ for (set<Context*>::iterator sit = it->second.begin();
sit != it->second.end();
sit++)
delete *sit;
};
+/*
+ * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, its global
+ * instantiation) that protects event execution with an existing mutex. It
+ * provides for, among other things, reliable event cancellation on class
+ * destruction. The caller just needs to cancel each event (or cancel_all()),
+ * and then call join() to ensure any concurrently executing events (in other
+ * threads) get flushed.
+ */
+class SafeTimer {
+ Mutex& lock;
+ Cond cond;
+ map<Context*,Context*> scheduled; // actual -> wrapper
+ map<Context*,Context*> canceled;
+
+ class EventWrapper : public Context {
+ SafeTimer *timer;
+ Context *actual;
+ public:
+ EventWrapper(SafeTimer *st, Context *c) : timer(st),
+ actual(c) {}
+ void finish(int r);
+ };
+
+public:
+ SafeTimer(Mutex& l) : lock(l) { }
+ ~SafeTimer();
+
+ void add_event_after(float seconds, Context *c);
+ void add_event_at(utime_t when, Context *c);
+ void cancel_event(Context *c);
+ void cancel_all();
+ void join();
+
+ int get_num_scheduled() { return scheduled.size(); }
+ int get_num_canceled() { return canceled.size(); }
+};
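A hypothetical usage sketch of the class above, assuming the project's Mutex and Context types; shutdown ordering follows the comment block (cancel everything still scheduled, then join):

Mutex mylock;
SafeTimer mytimer(mylock);

void schedule_tick(Context *c) {
  mylock.Lock();
  mytimer.add_event_after(5.0, c);   // c->finish(0) will later run with mylock held
  mylock.Unlock();
}

void shutdown() {
  mylock.Lock();
  mytimer.cancel_all();              // cancel anything still scheduled
  mytimer.join();                    // wait for events already executing to flush
  mylock.Unlock();
}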
+
+
// single global instance
extern Timer g_timer;
std::map<int,float> g_fake_osd_down;
std::map<int,float> g_fake_osd_out;
+entity_addr_t g_my_addr;
+
md_config_t g_debug_after_conf;
md_config_t g_conf = {
mon_tick_interval: 5,
mon_osd_down_out_interval: 5, // seconds
mon_lease: 2.000, // seconds
+ mon_stop_with_last_mds: true,
// --- client ---
client_cache_size: 300,
objecter_buffer_uncommitted: true,
// --- journaler ---
- journaler_allow_split_entries: false,
+ journaler_allow_split_entries: true,
// --- mds ---
mds_cache_size: MDS_CACHE_SIZE,
mds_decay_halflife: 30,
+ mds_beacon_interval: 5.0,
+ mds_beacon_grace: 10.0,
+
mds_log: true,
mds_log_max_len: MDS_CACHE_SIZE / 3,
mds_log_max_trimming: 10000,
mds_log_pad_entry: 128,//256,//64,
mds_log_before_reply: true,
mds_log_flush_on_shutdown: true,
-
+ mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log
mds_bal_replicate_threshold: 2000,
mds_bal_unreplicate_threshold: 0,//500,
mds_bal_hash_rd: 10000,
mds_commit_on_shutdown: true,
mds_shutdown_check: 0, //30,
+ mds_shutdown_on_last_unmount: true,
mds_verify_export_dirauth: true,
argv[argc++] = args[i];
}
+bool parse_ip_port(const char *s, entity_addr_t& a)
+{
+ int count = 0; // digit count
+ int off = 0;
+
+ while (1) {
+ // parse the #.
+ int val = 0;
+ int numdigits = 0;
+
+ while (*s >= '0' && *s <= '9') {
+ int digit = *s - '0';
+ //cout << "digit " << digit << endl;
+ val *= 10;
+ val += digit;
+ numdigits++;
+ s++; off++;
+ }
+ //cout << "val " << val << endl;
+
+ if (numdigits == 0) {
+ cerr << "no digits at off " << off << endl;
+ return false; // no digits
+ }
+ if (count < 3 && *s != '.') {
+ cerr << "should period at " << off << endl;
+ return false; // should have 3 periods
+ }
+ if (count == 3 && *s != ':') {
+ cerr << "expected : at " << off << endl;
+ return false; // then a colon
+ }
+ s++; off++;
+
+ if (count <= 3)
+ a.ipq[count] = val;
+ else
+ a.port = val;
+
+ count++;
+ if (count == 5) break;
+ }
+
+ return true;
+}
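For reference, a quick usage sketch of the parser above; the address and port values are just examples:

entity_addr_t a;
bool ok = parse_ip_port("10.0.1.2:6789", a);
assert(ok);   // a.ipq[0..3] now hold 10,0,1,2 and a.port is 6789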
+
+
+
void parse_config_options(std::vector<char*>& args)
{
std::vector<char*> nargs;
for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--nummon") == 0)
+ if (strcmp(args[i],"--bind") == 0)
+ assert(parse_ip_port(args[++i], g_my_addr));
+ else if (strcmp(args[i], "--nummon") == 0)
g_conf.num_mon = atoi(args[++i]);
else if (strcmp(args[i], "--nummds") == 0)
g_conf.num_mds = atoi(args[++i]);
g_conf.mds_commit_on_shutdown = atoi(args[++i]);
else if (strcmp(args[i], "--mds_shutdown_check") == 0)
g_conf.mds_shutdown_check = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_shutdown_on_last_unmount") == 0)
+ g_conf.mds_shutdown_on_last_unmount = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0)
g_conf.mds_log_flush_on_shutdown = atoi(args[++i]);
else if (strcmp(args[i], "--mds_local_osd") == 0)
g_conf.mds_local_osd = atoi(args[++i]);
-
+
+ else if (strcmp(args[i], "--client_use_random_mds") == 0)
+ g_conf.client_use_random_mds = true;
else if (strcmp(args[i], "--client_cache_size") == 0)
g_conf.client_cache_size = atoi(args[++i]);
else if (strcmp(args[i], "--client_cache_stat_ttl") == 0)
else if (strcmp(args[i], "--mon_osd_down_out_interval") == 0)
g_conf.mon_osd_down_out_interval = atoi(args[++i]);
+ else if (strcmp(args[i], "--mon_stop_with_last_mds") == 0)
+ g_conf.mon_stop_with_last_mds = atoi(args[++i]);
else if (strcmp(args[i], "--client_sync_writes") == 0)
g_conf.client_sync_writes = atoi(args[++i]);
#define OSD_REP_SPLAY 1
#define OSD_REP_CHAIN 2
+
+#include "msg/msg_types.h"
+
+extern entity_addr_t g_my_addr;
+
struct md_config_t {
int num_mon;
int num_mds;
int mon_tick_interval;
int mon_osd_down_out_interval;
float mon_lease;
+ bool mon_stop_with_last_mds;
// client
int client_cache_size;
float mds_decay_halflife;
+ float mds_beacon_interval;
+ float mds_beacon_grace;
+
bool mds_log;
int mds_log_max_len;
int mds_log_max_trimming;
int mds_log_pad_entry;
bool mds_log_before_reply;
bool mds_log_flush_on_shutdown;
+ off_t mds_log_import_map_interval;
float mds_bal_replicate_threshold;
float mds_bal_unreplicate_threshold;
bool mds_commit_on_shutdown;
int mds_shutdown_check;
+ bool mds_shutdown_on_last_unmount;
bool mds_verify_export_dirauth; // debug flag
bool mds_local_osd;
void parse_config_options(std::vector<char*>& args);
+extern bool parse_ip_port(const char *s, entity_addr_t& addr);
+
+
+
#endif
if (g_conf.debug_after)
g_timer.add_event_after(g_conf.debug_after, new C_Debug);
-
+ // osd specific args
char *dev;
int whoami = -1;
for (unsigned i=0; i<args.size(); i++) {
+- LogEvent.replay() must be idempotent; we won't know whether the update is old or not.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
journal is distributed among different nodes. because authority changes over time, it's not immediately clear to a recovering node replaying the journal whether the data is "real" or not (it might be exported later in the journal).
}
};
+class C_Die : public Context {
+public:
+ void finish(int) {
+ cerr << "die" << endl;
+ exit(1);
+ }
+};
+
int main(int argc, char **argv)
{
assert(nargs.empty());
+ if (g_conf.kill_after)
+ g_timer.add_event_after(g_conf.kill_after, new C_Die);
+
+
g_clock.tare();
MonMap *monmap = new MonMap(g_conf.num_mon);
- monmap->mon_inst[0].rank = 0; // hack ; see FakeMessenger.cc
-
+ entity_addr_t a;
+ monmap->mon_inst[0] = entity_inst_t(MSG_ADDR_MON(0), a); // hack ; see FakeMessenger.cc
+
char hostname[100];
gethostname(hostname,100);
//int pid = getpid();
// encoder/decode helpers
+// -- basic types --
// string
inline void _encode(const std::string& s, bufferlist& bl)
{
off += len;
}
+
#include <set>
#include <map>
#include <vector>
#include <string>
+// set<string>
+inline void _encode(const std::set<std::string>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (std::set<std::string>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ ::_encode(*it, bl);
+ n--;
+ }
+ assert(n==0);
+}
+inline void _decode(std::set<std::string>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ std::string v;
+ ::_decode(v, bl, off);
+ s.insert(v);
+ }
+ assert(s.size() == (unsigned)n);
+}
+
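A small round-trip sketch using the set<string> helpers above (bufferlist comes from include/buffer.h; the values are arbitrary examples):

bufferlist bl;
std::set<std::string> in, out;
in.insert("alpha");
in.insert("beta");
::_encode(in, bl);

int off = 0;
::_decode(out, bl, off);
assert(out == in);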
+// list<bufferlist>
+inline void _encode(const std::list<bufferlist>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (std::list<bufferlist>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ ::_encode(*it, bl);
+ n--;
+ }
+ assert(n==0);
+}
+inline void _decode(std::list<bufferlist>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ bufferlist v;
+ s.push_back(v);
+ ::_decode(s.back(), bl, off);
+ }
+ //assert(s.size() == (unsigned)n);
+}
+
+
// set<T>
template<class T>
-inline void _encode(std::set<T>& s, bufferlist& bl)
+inline void _encode(const std::set<T>& s, bufferlist& bl)
{
int n = s.size();
bl.append((char*)&n, sizeof(n));
- for (typename std::set<T>::iterator it = s.begin();
+ for (typename std::set<T>::const_iterator it = s.begin();
it != s.end();
it++) {
T v = *it;
assert(s.size() == (unsigned)n);
}
+// map<string,U>
+template<class U>
+inline void _encode(const std::map<std::string, U>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::map<std::string, U>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ ::_encode(it->first, bl);
+ U v = it->second;
+ bl.append((char*)&v, sizeof(v));
+ n--;
+ }
+ assert(n==0);
+}
+template<class U>
+inline void _decode(std::map<std::string,U>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ std::string k;
+ U v;
+ ::_decode(k, bl, off);
+ bl.copy(off, sizeof(v), (char*)&v);
+ off += sizeof(v);
+ s[k] = v;
+ }
+ assert(s.size() == (unsigned)n);
+}
+
+// map<T,set<U>>
+template<class T, class U>
+inline void _encode(const std::map<T, std::set<U> >& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::map<T, std::set<U> >::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T k = it->first;
+ bl.append((char*)&k, sizeof(k));
+ ::_encode(it->second, bl);
+ n--;
+ }
+ assert(n==0);
+}
+template<class T, class U>
+inline void _decode(std::map<T, std::set<U> >& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ T k;
+ bl.copy(off, sizeof(k), (char*)&k);
+ off += sizeof(k);
+ ::_decode(s[k], bl, off);
+ }
+ assert(s.size() == (unsigned)n);
+}
+
+
// map<T,U>
template<class T, class U>
inline void _encode(const std::map<T, U>& s, bufferlist& bl)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __REQID_H
+#define __REQID_H
+
+
+#include "include/types.h"
+#include "msg/msg_types.h"
+
+/* reqid_t - caller name + incarnation# + tid to uniquely identify this request;
+ * used for metadata and osd ops.
+ */
+class reqid_t {
+public:
+ entity_name_t name; // who
+ int inc; // incarnation
+ tid_t tid;
+ reqid_t() : inc(0), tid(0) {}
+ reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {}
+};
+
+inline ostream& operator<<(ostream& out, const reqid_t& r) {
+ return out << r.name << "." << r.inc << ":" << r.tid;
+}
+
+inline bool operator==(const reqid_t& l, const reqid_t& r) {
+ return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
+}
+inline bool operator!=(const reqid_t& l, const reqid_t& r) {
+ return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
+}
+inline bool operator<(const reqid_t& l, const reqid_t& r) {
+  return (l.name < r.name) ||
+    (l.name == r.name && (l.inc < r.inc ||
+                          (l.inc == r.inc && l.tid < r.tid)));
+}
+inline bool operator<=(const reqid_t& l, const reqid_t& r) {
+  return (l.name < r.name) ||
+    (l.name == r.name && (l.inc < r.inc ||
+                          (l.inc == r.inc && l.tid <= r.tid)));
+}
+inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); }
+inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); }
+
+namespace __gnu_cxx {
+ template<> struct hash<reqid_t> {
+ size_t operator()(const reqid_t &r) const {
+ static blobhash H;
+ return H((const char*)&r, sizeof(r));
+ }
+ };
+}
+
+
+#endif
#include <ext/rope>
using namespace __gnu_cxx;
-
#include "object.h"
-
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a):(b))
#endif
#endif
-// md ops
-#define MDS_OP_STATFS 1
-
-#define MDS_OP_STAT 100
-#define MDS_OP_LSTAT 101
-#define MDS_OP_UTIME 102
-#define MDS_OP_CHMOD 103
-#define MDS_OP_CHOWN 104
-
-
-#define MDS_OP_READDIR 200
-#define MDS_OP_MKNOD 201
-#define MDS_OP_LINK 202
-#define MDS_OP_UNLINK 203
-#define MDS_OP_RENAME 204
-
-#define MDS_OP_MKDIR 220
-#define MDS_OP_RMDIR 221
-#define MDS_OP_SYMLINK 222
-
-#define MDS_OP_OPEN 301
-#define MDS_OP_TRUNCATE 306
-#define MDS_OP_FSYNC 307
-//#define MDS_OP_CLOSE 310
-#define MDS_OP_RELEASE 308
-
-
-
// -- stl crap --
/*
compile now?
*/
+class blobhash {
+public:
+ size_t operator()(const char *p, unsigned len) {
+ static hash<long> H;
+ long acc = 0;
+ while (len >= sizeof(long)) {
+ acc ^= *(long*)p;
+ p += sizeof(long);
+ len -= sizeof(long);
+ }
+ int sh = 0;
+ while (len) {
+ acc ^= (long)*p << sh;
+ sh += 8;
+ len--;
+ p++;
+ }
+ return H(acc);
+ }
+};
+
+
namespace __gnu_cxx {
template<> struct hash< std::string >
{
+// ----------------------
+// some basic types
+
+typedef __uint64_t tid_t; // transaction id
+typedef __uint64_t version_t;
+typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
+
+
+
+
+
/** object layout
* how objects are mapped into PGs
*/
// -- inode --
-//typedef __uint64_t inodeno_t;
-
struct inodeno_t {
__uint64_t val;
inodeno_t() : val() {}
};
}
-typedef __uint64_t version_t;
-
-
#define INODE_MODE_FILE 0100000 // S_IFREG
#define INODE_MODE_SYMLINK 0120000 // S_IFLNK
-// lame 128-bit value class.
-class lame128_t {
-public:
- __uint64_t hi, lo;
- lame128_t(__uint64_t h=0, __uint64_t l=0) : hi(h), lo(l) {}
-};
-
-inline ostream& operator<<(ostream& out, lame128_t& oid) {
- return out << oid.hi << "." << oid.lo;
-}
-
-
-// osd types
-//typedef __uint32_t ps_t; // placement seed
-//typedef __uint32_t pg_t; // placement group
-typedef __uint64_t coll_t; // collection id
-typedef __uint64_t tid_t; // transaction id
-
-typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
-
-// pg stuff
-typedef __uint16_t ps_t;
-typedef __uint8_t pruleset_t;
-
-// placement group id
-struct pg_t {
- union {
- struct {
- int preferred;
- ps_t ps;
- __uint8_t nrep;
- pruleset_t ruleset;
- } fields;
- __uint64_t val;
- } u;
- pg_t() { u.val = 0; }
- pg_t(const pg_t& o) { u.val = o.u.val; }
- pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) {
- u.fields.ps = s;
- u.fields.preferred = p;
- u.fields.nrep = n;
- u.fields.ruleset = r;
- }
- pg_t(__uint64_t v) { u.val = v; }
- /*
- pg_t operator=(__uint64_t v) { u.val = v; return *this; }
- pg_t operator&=(__uint64_t v) { u.val &= v; return *this; }
- pg_t operator+=(pg_t o) { u.val += o.val; return *this; }
- pg_t operator-=(pg_t o) { u.val -= o.val; return *this; }
- pg_t operator++() { ++u.val; return *this; }
- */
- operator __uint64_t() const { return u.val; }
-};
-
-inline ostream& operator<<(ostream& out, pg_t pg) {
- //return out << hex << pg.val << dec;
- if (pg.u.fields.ruleset)
- out << (int)pg.u.fields.ruleset << '.';
- out << (int)pg.u.fields.nrep << '.';
- if (pg.u.fields.preferred)
- out << pg.u.fields.preferred << '.';
- out << hex << pg.u.fields.ps << dec;
- return out;
-}
-
-namespace __gnu_cxx {
- template<> struct hash< pg_t >
- {
- size_t operator()( const pg_t& x ) const
- {
- static hash<__uint64_t> H;
- return H(x);
- }
- };
-}
-
-
-
-// compound rados version type
-class eversion_t {
-public:
- epoch_t epoch;
- version_t version;
- eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {}
-};
-
-inline bool operator==(const eversion_t& l, const eversion_t& r) {
- return (l.epoch == r.epoch) && (l.version == r.version);
-}
-inline bool operator!=(const eversion_t& l, const eversion_t& r) {
- return (l.epoch != r.epoch) || (l.version != r.version);
-}
-inline bool operator<(const eversion_t& l, const eversion_t& r) {
- return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
-}
-inline bool operator<=(const eversion_t& l, const eversion_t& r) {
- return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
-}
-inline bool operator>(const eversion_t& l, const eversion_t& r) {
- return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
-}
-inline bool operator>=(const eversion_t& l, const eversion_t& r) {
- return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
-}
-inline ostream& operator<<(ostream& out, const eversion_t e) {
- return out << e.epoch << "'" << e.version;
-}
-
-
-
-#define PG_NONE 0xffffffffL
-
-
-typedef __uint16_t snapv_t; // snapshot version
-
-
-class OSDSuperblock {
-public:
- const static __uint64_t MAGIC = 0xeb0f505dULL;
- __uint64_t magic;
- __uint64_t fsid; // unique fs id (random number)
- int whoami; // my role in this fs.
- epoch_t current_epoch; // most recent epoch
- epoch_t oldest_map, newest_map; // oldest/newest maps we have.
- OSDSuperblock(__uint64_t f=0, int w=0) :
- magic(MAGIC), fsid(f), whoami(w),
- current_epoch(0), oldest_map(0), newest_map(0) {}
-};
-
-inline ostream& operator<<(ostream& out, OSDSuperblock& sb)
-{
- return out << "sb(fsid " << sb.fsid
- << " osd" << sb.whoami
- << " e" << sb.current_epoch
- << " [" << sb.oldest_map << "," << sb.newest_map
- << "])";
-}
-
-class MonSuperblock {
-public:
- const static __uint64_t MAGIC = 0x00eb0f5000ULL;
- __uint64_t magic;
- __uint64_t fsid;
- int whoami; // mon #
- epoch_t current_epoch;
- MonSuperblock(__uint64_t f=0, int w=0) :
- magic(MAGIC), fsid(f), whoami(w), current_epoch(0) {}
-};
-
-
-// new types
-
-class ObjectExtent {
- public:
- object_t oid; // object id
- off_t start; // in object
- size_t length; // in object
-
- objectrev_t rev; // which revision?
- pg_t pgid; // where to find the object
-
- map<size_t, size_t> buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
-
- ObjectExtent() : start(0), length(0), rev(0), pgid(0) {}
- ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { }
-};
-
-inline ostream& operator<<(ostream& out, ObjectExtent &ex)
-{
- return out << "extent("
- << ex.oid << " in " << hex << ex.pgid << dec
- << " " << ex.start << "~" << ex.length
- << ")";
-}
-
-
// client types
typedef int fh_t; // file handle
-
-
// -- io helpers --
template<class A>
pending_lookup_context[ino] = onfinish;
messenger->send_message(req,
- MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+ mdsmap->get_inst(mdsmap->get_anchortable()),
MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
}
pending_op[ino] = onfinish;
messenger->send_message(req,
- MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+ mdsmap->get_inst(mdsmap->get_anchortable()),
MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
}
pending_op[ino] = onfinish;
messenger->send_message(req,
- MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+ mdsmap->get_inst(mdsmap->get_anchortable()),
MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
}
pending_op[ino] = onfinish;
messenger->send_message(req,
- MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+ mdsmap->get_inst(mdsmap->get_anchortable()),
MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
}
}
// send reply
- mds->messenger->send_message(reply, m->get_source(), m->get_source_inst(), m->get_source_port());
+ mds->messenger->send_message(reply, m->get_source_inst(), m->get_source_port());
delete m;
}
#include "CInode.h"
#include "CDir.h"
+#include "MDS.h"
+#include "MDCache.h"
+
#include <cassert>
#undef dout
-#define dout(x) if ((x) <= g_conf.debug) cout << "mds.dentry "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "
// CDentry
ostream& operator<<(ostream& out, CDentry& dn)
{
- out << "[dentry " << dn.get_name();
- if (dn.is_pinned()) out << " " << dn.num_pins() << " pins";
+ string path;
+ dn.make_path(path);
+ out << "[dentry " << path;
+ if (dn.is_auth()) {
+ out << " auth";
+ if (dn.is_replicated())
+ out << dn.get_replicas();
+ } else {
+ out << " rep@" << dn.authority();
+ out << "." << dn.get_replica_nonce();
+ assert(dn.get_replica_nonce() >= 0);
+ }
+
if (dn.is_null()) out << " NULL";
if (dn.is_remote()) out << " REMOTE";
+ if (dn.is_pinned()) out << " " << dn.num_pins() << " pathpins";
+
if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning";
- if (dn.is_dirty()) out << " dirty";
if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set();
if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby();
- out << " dirv=" << dn.get_parent_dir_version();
+ out << " v=" << dn.get_version();
+ out << " pv=" << dn.get_projected_version();
out << " inode=" << dn.get_inode();
+
+ if (dn.get_num_ref()) {
+ out << " |";
+ for(set<int>::iterator it = dn.get_ref_set().begin();
+ it != dn.get_ref_set().end();
+ it++)
+ out << " " << CDentry::pin_name(*it);
+ }
+
out << " " << &dn;
- out << " in " << *dn.get_dir();
out << "]";
return out;
}
}
-void CDentry::mark_dirty()
+inodeno_t CDentry::get_ino()
{
- dout(10) << " mark_dirty " << *this << endl;
+ if (inode)
+ return inode->ino();
+ return inodeno_t();
+}
+
- // dir is now dirty (if it wasn't already)
- dir->mark_dirty();
+int CDentry::authority()
+{
+ return dir->dentry_authority( name );
+}
- // pin inode?
- if (is_primary() && !dirty && inode) inode->get(CINODE_PIN_DNDIRTY);
-
- // i now live in that (potentially newly dirty) version
- parent_dir_version = dir->get_version();
- dirty = true;
+version_t CDentry::pre_dirty()
+{
+ // NOTE: in the future, this will dirty a particular slice/subset of the dir.
+ projected_version = dir->pre_dirty();
+ dout(10) << " pre_dirty " << *this << endl;
+ return projected_version;
}
+
+
+void CDentry::_mark_dirty()
+{
+ // state+pin
+ if (!state_test(STATE_DIRTY)) {
+ state_set(STATE_DIRTY);
+ get(PIN_DIRTY);
+ }
+}
+
+void CDentry::mark_dirty(version_t pv)
+{
+ dout(10) << " mark_dirty " << *this << endl;
+
+ // i now live in this new dir version
+ assert(pv == projected_version);
+ version = pv;
+ _mark_dirty();
+
+ // mark dir too
+ dir->mark_dirty(pv);
+}
+
void CDentry::mark_clean() {
dout(10) << " mark_clean " << *this << endl;
- assert(dirty);
- assert(parent_dir_version <= dir->get_version());
-
- if (parent_dir_version < dir->get_last_committed_version())
- cerr << " bad mark_clean " << *this << endl;
+ assert(is_dirty());
+ assert(version <= dir->get_version());
- assert(parent_dir_version >= dir->get_last_committed_version());
+ // this happens on export.
+ //assert(version <= dir->get_last_committed_version());
- if (is_primary() && dirty && inode) inode->put(CINODE_PIN_DNDIRTY);
- dirty = false;
+ // state+pin
+ state_clear(STATE_DIRTY);
+ put(PIN_DIRTY);
}
void CDentry::make_path(string& s)
{
- if (dir->inode->get_parent_dn())
- dir->inode->get_parent_dn()->make_path(s);
-
+ if (dir) {
+ if (dir->inode->get_parent_dn())
+ dir->inode->get_parent_dn()->make_path(s);
+ } else {
+ s = "???";
+ }
s += "/";
s += name;
}
}
+CDentryDiscover *CDentry::replicate_to(int who)
+{
+ int nonce = add_replica(who);
+ return new CDentryDiscover(this, nonce);
+}
+
using namespace std;
#include "include/types.h"
+#include "include/buffer.h"
+#include "include/lru.h"
+#include "mdstypes.h"
class CInode;
class CDir;
#define DN_LOCK_SYNC 0
#define DN_LOCK_PREXLOCK 1
#define DN_LOCK_XLOCK 2
-#define DN_LOCK_UNPINNING 3 // waiting for pins to go away
+#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE ..
#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer.
class Message;
+class CDentryDiscover;
// dentry
-class CDentry {
+class CDentry : public MDSCacheObject, public LRUObject {
+ public:
+ // state
+ static const int STATE_AUTH = (1<<0);
+ static const int STATE_DIRTY = (1<<1);
+
+ // pins
+ static const int PIN_INODEPIN = 0; // linked inode is pinned
+ static const int PIN_REPLICATED = 1; // replicated by another MDS
+ static const int PIN_DIRTY = 2; //
+ static const int PIN_PROXY = 3; //
+ static const char *pin_name(int p) {
+ switch (p) {
+ case PIN_INODEPIN: return "inodepin";
+ case PIN_REPLICATED: return "replicated";
+ case PIN_DIRTY: return "dirty";
+ case PIN_PROXY: return "proxy";
+ default: assert(0);
+ }
+ };
+
+
protected:
string name;
CInode *inode;
inodeno_t remote_ino; // if remote dentry
- // state
- bool dirty;
- version_t parent_dir_version; // dir version when last touched.
+ version_t version; // dir version when last touched.
+ version_t projected_version; // what it will be when i unlock/commit.
// locking
int lockstate;
Message *xlockedby;
set<int> gather_set;
+ // path pins
int npins;
multiset<Message*> pinset;
inode(0),
dir(0),
remote_ino(0),
- dirty(0),
- parent_dir_version(0),
+ version(0),
+ projected_version(0),
lockstate(DN_LOCK_SYNC),
xlockedby(0),
npins(0) { }
inode(in),
dir(0),
remote_ino(ino),
- dirty(0),
- parent_dir_version(0),
+ version(0),
+ projected_version(0),
lockstate(DN_LOCK_SYNC),
xlockedby(0),
npins(0) { }
inode(in),
dir(0),
remote_ino(0),
- dirty(0),
- parent_dir_version(0),
+ version(0),
+ projected_version(0),
lockstate(DN_LOCK_SYNC),
xlockedby(0),
npins(0) { }
CInode *get_inode() { return inode; }
CDir *get_dir() { return dir; }
const string& get_name() { return name; }
+ inodeno_t get_ino();
inodeno_t get_remote_ino() { return remote_ino; }
void set_remote_ino(inodeno_t ino) { remote_ino = ino; }
+
+ // ref counts: pin ourselves in the LRU when we're pinned.
+ void first_get() {
+ lru_pin();
+ }
+ void last_put() {
+ lru_unpin();
+ }
+
+
// dentry type is primary || remote || null
// inode ptr is required for primary, optional for remote, undefined for null
bool is_primary() { return remote_ino == 0 && inode != 0; }
void make_path(string& p);
// -- state
- __uint64_t get_parent_dir_version() { return parent_dir_version; }
- void float_parent_dir_version(__uint64_t ge) {
- if (parent_dir_version < ge)
- parent_dir_version = ge;
- }
+ version_t get_version() { return version; }
+ void set_version(version_t v) { projected_version = version = v; }
+ version_t get_projected_version() { return projected_version; }
+ void set_projected_version(version_t v) { projected_version = v; }
- bool is_dirty() { return dirty; }
- bool is_clean() { return !dirty; }
+ int authority();
+
+ bool is_auth() { return state & STATE_AUTH; }
+ bool is_dirty() { return state & STATE_DIRTY; }
+ bool is_clean() { return !is_dirty(); }
- void mark_dirty();
+ version_t pre_dirty();
+ void _mark_dirty();
+ void mark_dirty(version_t projected_dirv);
void mark_clean();
+
+ // -- replication
+ CDentryDiscover *replicate_to(int rep);
+
// -- locking
int get_lockstate() { return lockstate; }
bool is_prexlockbyother(Message *m) {
return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby;
}
+
+ int get_replica_lockstate() {
+ switch (lockstate) {
+ case DN_LOCK_XLOCK:
+ case DN_LOCK_SYNC:
+ return lockstate;
+ case DN_LOCK_PREXLOCK:
+ return DN_LOCK_XLOCK;
+ case DN_LOCK_UNPINNING:
+ return DN_LOCK_SYNC;
+ }
+ assert(0);
+ return 0;
+ }
+ void set_lockstate(int s) { lockstate = s; }
- // pins
+ // path pins
void pin(Message *m) {
npins++;
pinset.insert(m);
ostream& operator<<(ostream& out, CDentry& dn);
+class CDentryDiscover {
+ string dname;
+ int replica_nonce;
+ int lockstate;
+
+ inodeno_t ino;
+ inodeno_t remote_ino;
+
+public:
+ CDentryDiscover() {}
+ CDentryDiscover(CDentry *dn, int nonce) :
+ dname(dn->get_name()), replica_nonce(nonce),
+ lockstate(dn->get_replica_lockstate()),
+ ino(dn->get_ino()),
+ remote_ino(dn->get_remote_ino()) { }
+
+ string& get_dname() { return dname; }
+ int get_nonce() { return replica_nonce; }
+
+ void update_dentry(CDentry *dn) {
+ dn->set_replica_nonce( replica_nonce );
+ dn->set_lockstate( lockstate );
+ }
+
+ void _encode(bufferlist& bl) {
+ ::_encode(dname, bl);
+ bl.append((char*)&replica_nonce, sizeof(replica_nonce));
+ bl.append((char*)&lockstate, sizeof(lockstate));
+ }
+
+ void _decode(bufferlist& bl, int& off) {
+ ::_decode(dname, bl, off);
+ bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce);
+ off += sizeof(replica_nonce);
+ bl.copy(off, sizeof(lockstate), (char*)&lockstate);
+ off += sizeof(lockstate);
+ }
+
+};
+
+
#endif
#include "CInode.h"
#include "MDS.h"
+#include "MDCache.h"
#include "MDSMap.h"
#include "include/Context.h"
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") "
// PINS
-int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
-
-static char* cdir_pin_names[CDIR_NUM_PINS] = {
- "child",
- "opened",
- "waiter",
- "import",
- "export",
- "freeze",
- "proxy",
- "authpin",
- "imping",
- "impex",
- "hashed",
- "hashing",
- "dirty",
- "reqpins"
-};
+//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+
ostream& operator<<(ostream& out, CDir& dir)
string path;
dir.get_inode()->make_path(path);
out << "[dir " << dir.ino() << " " << path << "/";
- if (dir.is_dirty()) out << " dirty";
- if (dir.is_import()) out << " import";
- if (dir.is_export()) out << " export";
- if (dir.is_rep()) out << " repl";
- if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed;
if (dir.is_auth()) {
out << " auth";
- if (dir.is_open_by_anyone())
- out << "+" << dir.get_open_by();
+ if (dir.is_replicated())
+ out << dir.get_replicas();
+
+ out << " v=" << dir.get_version();
+ out << " pv=" << dir.get_projected_version();
+ out << " cv=" << dir.get_committing_version();
+ out << " lastcv=" << dir.get_last_committed_version();
} else {
out << " rep@" << dir.authority();
if (dir.get_replica_nonce() > 1)
out << "." << dir.get_replica_nonce();
}
- if (dir.is_pinned()) {
+ if (dir.get_dir_auth() != CDIR_AUTH_PARENT)
+ out << " dir_auth=" << dir.get_dir_auth();
+
+ out << " state=" << dir.get_state();
+ if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy";
+ if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete";
+ if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree";
+ if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree";
+ if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf";
+ if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir";
+ if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir";
+
+ out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
+
+ if (dir.get_num_ref()) {
out << " |";
for(set<int>::iterator it = dir.get_ref_set().begin();
it != dir.get_ref_set().end();
it++)
- if (*it < CDIR_NUM_PINS)
- out << " " << cdir_pin_names[*it];
- else
- out << " " << *it;
+ out << " " << CDir::pin_name(*it);
}
- if (dir.get_dir_auth() != CDIR_AUTH_PARENT)
- out << " dir_auth=" << dir.get_dir_auth();
-
- out << " state=" << dir.get_state();
- out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
-
- out << " v=" << dir.get_version();
- out << " cv=" << dir.get_committing_version();
- out << " lastcv=" << dir.get_last_committed_version();
-
out << " " << &dir;
return out << "]";
}
// -------------------------------------------------------------------
// CDir
-CDir::CDir(CInode *in, MDS *mds, bool auth)
+CDir::CDir(CInode *in, MDCache *mdcache, bool auth)
{
inode = in;
- this->mds = mds;
+ this->cache = mdcache;
nitems = 0;
nnull = 0;
state = CDIR_STATE_INITIAL;
- version = 0;
+ projected_version = version = 0;
committing_version = 0;
last_committed_version = 0;
* linking fun
*/
-CDentry* CDir::add_dentry( const string& dname, inodeno_t ino)
+CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth)
{
// foreign
assert(lookup(dname) == 0);
// create dentry
CDentry* dn = new CDentry(dname, ino);
+ if (auth)
+ dn->state_set(CDentry::STATE_AUTH);
+ cache->lru.lru_insert_mid(dn);
+
dn->dir = this;
- dn->parent_dir_version = version;
+ dn->version = projected_version;
// add to dir
assert(items.count(dn->name) == 0);
dout(12) << "add_dentry " << *dn << endl;
// pin?
- if (nnull + nitems == 1) get(CDIR_PIN_CHILD);
+ if (nnull + nitems == 1) get(PIN_CHILD);
assert(nnull + nitems == items.size());
assert(nnull == null_items.size());
}
-CDentry* CDir::add_dentry( const string& dname, CInode *in )
+CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth )
{
// primary
assert(lookup(dname) == 0);
// create dentry
CDentry* dn = new CDentry(dname, in);
+ if (auth)
+ dn->state_set(CDentry::STATE_AUTH);
+ cache->lru.lru_insert_mid(dn);
+
dn->dir = this;
- dn->parent_dir_version = version;
+ dn->version = projected_version;
// add to dir
assert(items.count(dn->name) == 0);
dout(12) << "add_dentry " << *dn << endl;
// pin?
- if (nnull + nitems == 1) get(CDIR_PIN_CHILD);
+ if (nnull + nitems == 1) get(PIN_CHILD);
assert(nnull + nitems == items.size());
assert(nnull == null_items.size());
unlink_inode_work(dn);
} else {
// remove from null list
- assert(null_items.count(dn->name) == 1);
+ assert(null_items.count(dn->name) == 1);
null_items.erase(dn->name);
nnull--;
}
assert(items.count(dn->name) == 1);
items.erase(dn->name);
+ cache->lru.lru_remove(dn);
delete dn;
// unpin?
- if (nnull + nitems == 0) put(CDIR_PIN_CHILD);
+ if (nnull + nitems == 0) put(PIN_CHILD);
assert(nnull + nitems == items.size());
assert(nnull == null_items.size());
void CDir::link_inode( CDentry *dn, inodeno_t ino)
{
- //dout(12) << "link_inode " << *dn << " remote " << ino << endl;
+ dout(12) << "link_inode " << *dn << " remote " << ino << endl;
assert(dn->is_null());
dn->set_remote_ino(ino);
void CDir::link_inode( CDentry *dn, CInode *in )
{
+ dout(12) << "link_inode " << *dn << " " << *in << endl;
assert(!dn->is_remote());
link_inode_work(dn,in);
- //dout(12) << "link_inode " << *dn << " " << *in << endl;
// remove from null list
assert(null_items.count(dn->name) == 1);
nitems++; // adjust dir size
// set dir version
- in->parent_dir_version = get_version();
+ in->inode.version = dn->get_version();
// clear dangling
- in->state_clear(CINODE_STATE_DANGLING);
-
- // dn dirty?
- if (dn->is_dirty()) in->get(CINODE_PIN_DNDIRTY);
+ in->state_clear(CInode::STATE_DANGLING);
+ // pin dentry?
+ if (in->get_num_ref())
+ dn->get(CDentry::PIN_INODEPIN);
+
// adjust auth pin count
if (in->auth_pins + in->nested_auth_pins)
adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins );
// explicitly define auth
in->dangling_auth = in->authority();
//dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl;
+
+ // unpin dentry?
+ if (in->get_num_ref())
+ dn->put(CDentry::PIN_INODEPIN);
// unlink auth_pin count
if (in->auth_pins + in->nested_auth_pins)
adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) );
// set dangling flag
- in->state_set(CINODE_STATE_DANGLING);
+ in->state_set(CInode::STATE_DANGLING);
- // dn dirty?
- if (dn->is_dirty()) in->put(CINODE_PIN_DNDIRTY);
-
// detach inode
in->remove_primary_parent(dn);
dn->inode = 0;
const string& dentry,
Context *c) {
if (waiting.empty() && waiting_on_dentry.size() == 0)
- get(CDIR_PIN_WAITER);
+ get(PIN_WAITER);
waiting_on_dentry[ dentry ].insert(pair<int,Context*>(tag,c));
dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl;
}
// this dir.
if (waiting.empty() && waiting_on_dentry.size() == 0)
- get(CDIR_PIN_WAITER);
+ get(PIN_WAITER);
waiting.insert(pair<int,Context*>(tag,c));
dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
}
// ...whole map?
if (waiting_on_dentry.size() == 0 && waiting.empty())
- put(CDIR_PIN_WAITER);
+ put(PIN_WAITER);
}
/* NOTE: this checks dentry waiters too */
}
if (waiting_on_dentry.size() == 0 && waiting.empty())
- put(CDIR_PIN_WAITER);
+ put(PIN_WAITER);
}
}
// dirty/clean
-void CDir::mark_dirty()
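+// an update first reserves the next projected version (pre_dirty),
+// then marks the dir dirty at that version once the change is applied
+// (mark_dirty).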
+version_t CDir::pre_dirty()
+{
+ ++projected_version;
+ dout(10) << "pre_dirty " << projected_version << endl;
+ return projected_version;
+}
+
+void CDir::_mark_dirty()
{
if (!state_test(CDIR_STATE_DIRTY)) {
- version++;
state_set(CDIR_STATE_DIRTY);
- dout(10) << "mark_dirty (was clean) " << *this << " new version " << version << endl;
- get(CDIR_PIN_DIRTY);
- }
- else if (state_test(CDIR_STATE_COMMITTING) &&
- committing_version == version) {
- version++; // now dirtier than committing version!
- dout(10) << "mark_dirty (committing) " << *this << " new version " << version << "/" << committing_version << endl;
+ dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl;
+ get(PIN_DIRTY);
} else {
dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl;
}
}
+void CDir::mark_dirty(version_t pv)
+{
+ ++version;
+ assert(pv == version);
+ _mark_dirty();
+}
+
void CDir::mark_clean()
{
dout(10) << "mark_clean " << *this << " version " << version << endl;
if (state_test(CDIR_STATE_DIRTY)) {
state_clear(CDIR_STATE_DIRTY);
- put(CDIR_PIN_DIRTY);
+ put(PIN_DIRTY);
}
}
-// ref counts
-
-void CDir::put(int by) {
- cdir_pins[by]--;
-
- // bad?
- if (ref == 0 || ref_set.count(by) != 1) {
- dout(7) << *this << " bad put by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
- assert(ref_set.count(by) == 1);
- assert(ref > 0);
- }
-
- ref--;
- ref_set.erase(by);
- // inode
- if (ref == 0)
- inode->put(CINODE_PIN_DIR);
-
- dout(7) << *this << " put by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+void CDir::first_get()
+{
+ inode->get(CInode::PIN_DIR);
}
-void CDir::get(int by) {
- cdir_pins[by]++;
-
- // inode
- if (ref == 0)
- inode->get(CINODE_PIN_DIR);
-
- // bad?
- if (ref_set.count(by)) {
- dout(7) << *this << " bad get by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
- assert(ref_set.count(by) == 0);
- }
-
- ref++;
- ref_set.insert(by);
-
- dout(7) << *this << " get by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+void CDir::last_put()
+{
+ inode->put(CInode::PIN_DIR);
}
*/
int CDir::authority()
{
- if (get_dir_auth() >= 0)
- return get_dir_auth();
-
- /*
- CDir *parent = inode->get_parent_dir();
- if (parent)
- return parent->authority();
-
- // root, or dangling
- assert(inode->is_root()); // no dirs under danglers!?
- //assert(inode->is_root() || inode->is_dangling());
- */
-
- return inode->authority();
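+ // CDIR_AUTH_PARENT means "defer to the inode's authority"; anything
+ // else is an explicitly delegated dir_auth (e.g. an import boundary).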
+ if (dir_auth == CDIR_AUTH_PARENT)
+ return inode->authority();
+ return dir_auth;
}
int CDir::dentry_authority(const string& dn )
{
// hashing -- subset of nodes have hashed the contents
if (is_hashing() && !hashed_subset.empty()) {
- int hashauth = mds->hash_dentry( inode->ino(), dn ); // hashed
+ int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed
if (hashed_subset.count(hashauth))
return hashauth;
}
// hashed
if (is_hashed()) {
- return mds->hash_dentry( inode->ino(), dn ); // hashed
+ return cache->hash_dentry( inode->ino(), dn ); // hashed
}
if (get_dir_auth() == CDIR_AUTH_PARENT) {
void CDir::auth_pin() {
if (auth_pins == 0)
- get(CDIR_PIN_AUTHPIN);
+ get(PIN_AUTHPIN);
auth_pins++;
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
void CDir::auth_unpin() {
auth_pins--;
if (auth_pins == 0)
- put(CDIR_PIN_AUTHPIN);
+ put(PIN_AUTHPIN);
dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
assert(auth_pins >= 0);
void CDir::unfreeze_tree()
{
dout(10) << "unfreeze_tree " << *this << endl;
- state_clear(CDIR_STATE_FROZENTREE);
-
- // unpin (may => FREEZEABLE) FIXME: is this order good?
- inode->auth_unpin();
- // waiters?
- finish_waiting(CDIR_WAIT_UNFREEZE);
+ if (state_test(CDIR_STATE_FROZENTREE)) {
+ // frozen. unfreeze.
+ state_clear(CDIR_STATE_FROZENTREE);
+
+ // unpin (may => FREEZEABLE) FIXME: is this order good?
+ inode->auth_unpin();
+
+ // waiters?
+ finish_waiting(CDIR_WAIT_UNFREEZE);
+ } else {
+ // freezing. stop it.
+ assert(state_test(CDIR_STATE_FREEZINGTREE));
+ state_clear(CDIR_STATE_FREEZINGTREE);
+
+ // cancel freeze waiters
+ finish_waiting(CDIR_WAIT_FREEZEABLE, -1);
+ }
}
bool CDir::is_freezing_tree()
if (dir->is_frozen_tree_root()) return true;
if (dir->is_import()) return false;
if (dir->is_hashed()) return false;
+ if (dir->is_frozen_tree_leaf()) return false;
if (dir->inode->parent)
dir = dir->inode->parent->dir;
else
#include "include/types.h"
#include "include/buffer.h"
+#include "mdstypes.h"
#include "config.h"
#include "common/DecayCounter.h"
#include "CInode.h"
class CDentry;
-class MDS;
+class MDCache;
class MDCluster;
class Context;
// directory authority types
// >= 0 is the auth mds
#define CDIR_AUTH_PARENT -1 // default
+#define CDIR_AUTH_UNKNOWN -2
#define CDIR_NONCE_EXPORT 1
#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache
#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit
-#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports)
-#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing
-#define CDIR_STATE_FROZENDIR (1<<6)
-#define CDIR_STATE_FREEZINGDIR (1<<7)
+#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports)
+#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing
+#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import)
+#define CDIR_STATE_FROZENDIR (1<<7)
+#define CDIR_STATE_FREEZINGDIR (1<<8)
-#define CDIR_STATE_COMMITTING (1<<8) // mid-commit
-#define CDIR_STATE_FETCHING (1<<9) // currenting fetching
+#define CDIR_STATE_COMMITTING (1<<9) // mid-commit
+#define CDIR_STATE_FETCHING (1<<10) // currently fetching
-#define CDIR_STATE_DELETED (1<<10)
+#define CDIR_STATE_DELETED (1<<11)
-#define CDIR_STATE_IMPORT (1<<11) // flag set if this is an import.
-#define CDIR_STATE_EXPORT (1<<12)
-#define CDIR_STATE_IMPORTINGEXPORT (1<<13)
+#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import.
+#define CDIR_STATE_EXPORT (1<<13)
+#define CDIR_STATE_IMPORTINGEXPORT (1<<14)
-#define CDIR_STATE_HASHED (1<<14) // if hashed
-#define CDIR_STATE_HASHING (1<<15)
-#define CDIR_STATE_UNHASHING (1<<16)
+#define CDIR_STATE_HASHED (1<<15) // if hashed
+#define CDIR_STATE_HASHING (1<<16)
+#define CDIR_STATE_UNHASHING (1<<17)
|CDIR_STATE_DIRTY)
#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\
|CDIR_STATE_EXPORT\
- |CDIR_STATE_IMPORTINGEXPORT)
+ |CDIR_STATE_IMPORTINGEXPORT\
+ |CDIR_STATE_FROZENTREE\
+ |CDIR_STATE_PROXY)
+
#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\
|CDIR_STATE_FROZENTREE\
|CDIR_STATE_FROZENDIR\
|CDIR_STATE_EXPORT\
- |CDIR_STATE_PROXY)
+ |CDIR_STATE_PROXY)
// common states
#define CDIR_STATE_CLEAN 0
-// pins
-
-#define CDIR_PIN_CHILD 0
-#define CDIR_PIN_OPENED 1 // open by another node
-#define CDIR_PIN_WAITER 2 // waiter(s)
-
-#define CDIR_PIN_IMPORT 3
-#define CDIR_PIN_EXPORT 4
-#define CDIR_PIN_FREEZE 5
-#define CDIR_PIN_PROXY 6 // auth just changed.
-
-#define CDIR_PIN_AUTHPIN 7
-
-#define CDIR_PIN_IMPORTING 8
-#define CDIR_PIN_IMPORTINGEXPORT 9
-
-#define CDIR_PIN_HASHED 10
-#define CDIR_PIN_HASHING 11
-#define CDIR_PIN_DIRTY 12
-
-#define CDIR_PIN_REQUEST 13
-
-#define CDIR_NUM_PINS 14
typedef map<string, CDentry*> CDir_map_t;
-extern int cdir_pins[CDIR_NUM_PINS];
+//extern int cdir_pins[CDIR_NUM_PINS];
-class CDir {
+class CDir : public MDSCacheObject {
public:
+ // -- pins --
+ static const int PIN_CHILD = 0;
+ static const int PIN_OPENED = 1; // open by another node
+ static const int PIN_WAITER = 2; // waiter(s)
+ static const int PIN_IMPORT = 3;
+ static const int PIN_EXPORT = 4;
+ //static const int PIN_FREEZE = 5;
+ static const int PIN_FREEZELEAF = 6;
+ static const int PIN_PROXY = 7; // auth just changed.
+ static const int PIN_AUTHPIN = 8;
+ static const int PIN_IMPORTING = 9;
+ static const int PIN_IMPORTINGEXPORT = 10;
+ static const int PIN_HASHED = 11;
+ static const int PIN_HASHING = 12;
+ static const int PIN_DIRTY = 13;
+ static const int PIN_REQUEST = 14;
+ static const char *pin_name(int p) {
+ switch (p) {
+ case PIN_CHILD: return "child";
+ case PIN_OPENED: return "opened";
+ case PIN_WAITER: return "waiter";
+ case PIN_IMPORT: return "import";
+ case PIN_EXPORT: return "export";
+ //case PIN_FREEZE: return "freeze";
+ case PIN_FREEZELEAF: return "freezeleaf";
+ case PIN_PROXY: return "proxy";
+ case PIN_AUTHPIN: return "authpin";
+ case PIN_IMPORTING: return "importing";
+ case PIN_IMPORTINGEXPORT: return "importingexport";
+ case PIN_HASHED: return "hashed";
+ case PIN_HASHING: return "hashing";
+ case PIN_DIRTY: return "dirty";
+ case PIN_REQUEST: return "request";
+ default: assert(0);
+ }
+ }
+
+
+ public:
+ // context
+ MDCache *cache;
+
+ // my inode
CInode *inode;
protected:
CDir_map_t null_items; // null and foreign
size_t nitems; // non-null
size_t nnull; // null
- //size_t nauthitems;
- //size_t namesize;
// state
- unsigned state;
version_t version;
version_t committing_version;
- version_t last_committed_version;
+ version_t last_committed_version; // slight lie; we bump this on import.
+ version_t projected_version;
// authority, replicas
- set<int> open_by; // nodes that have me open
- map<int,int> open_by_nonce;
- int replica_nonce;
int dir_auth;
- // reference countin/pins
- int ref; // reference count
- set<int> ref_set;
-
// lock nesting, freeze
int auth_pins;
int nested_auth_pins;
map<int, pair< list<class InodeStat*>, list<string> > > hashed_readdir;
protected:
- // context
- MDS *mds;
// waiters
friend class CDirExport;
public:
- CDir(CInode *in, MDS *mds, bool auth);
+ CDir(CInode *in, MDCache *mdcache, bool auth);
CDir_map_t::iterator begin() { return items.begin(); }
CDir_map_t::iterator end() { return items.end(); }
size_t get_size() {
-
- //if ( is_auth() && !is_hashed()) assert(nauthitems == nitems);
- //if (!is_auth() && !is_hashed()) assert(nauthitems == 0);
-
return nitems;
}
size_t get_nitems() { return nitems; }
size_t get_nnull() { return nnull; }
- /*
- size_t get_auth_size() {
- assert(nauthitems <= nitems);
- return nauthitems;
- }
- */
/*
float get_popularity() {
return iter->second;
}
- CDentry* add_dentry( const string& dname, CInode *in=0 );
- CDentry* add_dentry( const string& dname, inodeno_t ino );
+ CDentry* add_dentry( const string& dname, CInode *in=0, bool auth=true );
+ CDentry* add_dentry( const string& dname, inodeno_t ino, bool auth=true );
void remove_dentry( CDentry *dn ); // delete dentry
void link_inode( CDentry *dn, inodeno_t ino );
void link_inode( CDentry *dn, CInode *in );
int get_dir_auth() { return dir_auth; }
void set_dir_auth(int d);
- bool is_open_by_anyone() { return !open_by.empty(); }
- bool is_open_by(int mds) { return open_by.count(mds); }
- int get_open_by_nonce(int mds) {
- map<int,int>::iterator it = open_by_nonce.find(mds);
- return it->second;
- }
- set<int>::iterator open_by_begin() { return open_by.begin(); }
- set<int>::iterator open_by_end() { return open_by.end(); }
- set<int>& get_open_by() { return open_by; }
-
- int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
-
- int open_by_add(int mds) {
- int nonce = 1;
-
- if (is_open_by(mds)) { // already had it?
- nonce = get_open_by_nonce(mds) + 1; // new nonce (+1)
- dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
- open_by_nonce.erase(mds);
- } else {
- if (open_by.empty())
- get(CDIR_PIN_OPENED);
- open_by.insert(mds);
- }
- open_by_nonce.insert(pair<int,int>(mds,nonce)); // first! serial of 1.
- return nonce; // default nonce
- }
- void open_by_remove(int mds) {
- //if (!is_open_by(mds)) return;
- assert(is_open_by(mds));
-
- open_by.erase(mds);
- open_by_nonce.erase(mds);
- if (open_by.empty())
- put(CDIR_PIN_OPENED);
- }
- void open_by_clear() {
- if (!open_by.empty())
- put(CDIR_PIN_OPENED);
- open_by.clear();
- open_by_nonce.clear();
- }
-
-
+
// for giving to clients
void get_dist_spec(set<int>& ls, int auth) {
if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) {
//if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
- ls = open_by;
- if (!ls.empty()) ls.insert(auth);
+ for (map<int,int>::iterator p = replicas_begin();
+ p != replicas_end();
+ ++p)
+ ls.insert(p->first);
+ if (!ls.empty())
+ ls.insert(auth);
}
}
// -- state --
- unsigned get_state() { return state; }
- void reset_state(unsigned s) {
- state = s;
- dout(10) << " cdir:" << *this << " state reset" << endl;
- }
- void state_clear(unsigned mask) {
- state &= ~mask;
- dout(10) << " cdir:" << *this << " state -" << mask << " = " << state << endl;
- }
- void state_set(unsigned mask) {
- state |= mask;
- dout(10) << " cdir:" << *this << " state +" << mask << " = " << state << endl;
- }
- unsigned state_test(unsigned mask) { return state & mask; }
-
bool is_complete() { return state & CDIR_STATE_COMPLETE; }
bool is_dirty() { return state_test(CDIR_STATE_DIRTY); }
// -- dirtyness --
version_t get_version() { return version; }
- void float_version(version_t ge) {
- if (version < ge)
- version = ge;
- }
- void set_version(version_t v) { version = v; }
-
+ void set_version(version_t v) { projected_version = version = v; }
+ version_t get_projected_version() { return projected_version; }
+
version_t get_committing_version() { return committing_version; }
version_t get_last_committed_version() { return last_committed_version; }
// as in, we're committing the current version.
void set_committing_version() { committing_version = version; }
void set_last_committed_version(version_t v) { last_committed_version = v; }
- void mark_dirty();
+
+ version_t pre_dirty();
+ void _mark_dirty();
+ void mark_dirty(version_t pv);
void mark_clean();
void mark_complete() { state_set(CDIR_STATE_COMPLETE); }
bool is_clean() { return !state_test(CDIR_STATE_DIRTY); }
// -- reference counting --
- void put(int by);
- void get(int by);
- bool is_pinned_by(int by) {
- return ref_set.count(by);
- }
- bool is_pinned() { return ref > 0; }
- int get_ref() { return ref; }
- set<int>& get_ref_set() { return ref_set; }
+ void first_get();
+ void last_put();
+
void request_pin_get() {
- if (request_pins == 0) get(CDIR_PIN_REQUEST);
+ if (request_pins == 0) get(PIN_REQUEST);
request_pins++;
}
void request_pin_put() {
request_pins--;
- if (request_pins == 0) put(CDIR_PIN_REQUEST);
+ if (request_pins == 0) put(PIN_REQUEST);
}
bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); }
bool is_frozen_tree();
bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; }
+ bool is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; }
bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; }
bool is_freezeable() {
// export
-typedef struct {
- inodeno_t ino;
- __uint64_t nitems; // actual real entries
- __uint64_t nden; // num dentries (including null ones)
- version_t version;
- unsigned state;
- meta_load_t popularity_justme;
- meta_load_t popularity_curdom;
- int dir_auth;
- int dir_rep;
- int nopen_by;
- int nrep_by;
- // ints follow
-} CDirExport_st;
-
class CDirExport {
- CDirExport_st st;
- set<int> open_by;
- map<int,int> open_by_nonce;
+ struct {
+ inodeno_t ino;
+ long nitems; // actual real entries
+ long nden; // num dentries (including null ones)
+ version_t version;
+ unsigned state;
+ meta_load_t popularity_justme;
+ meta_load_t popularity_curdom;
+ int dir_rep;
+ } st;
+ map<int,int> replicas;
set<int> rep_by;
public:
CDirExport(CDir *dir) {
memset(&st, 0, sizeof(st));
+ assert(dir->get_version() == dir->get_projected_version());
+
st.ino = dir->ino();
st.nitems = dir->nitems;
st.nden = dir->items.size();
st.version = dir->version;
st.state = dir->state;
- st.dir_auth = dir->dir_auth;
st.dir_rep = dir->dir_rep;
st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] );
dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
rep_by = dir->dir_rep_by;
- open_by = dir->open_by;
- open_by_nonce = dir->open_by_nonce;
+ replicas = dir->replicas;
}
inodeno_t get_ino() { return st.ino; }
assert(dir->ino() == st.ino);
//dir->nitems = st.nitems;
- dir->version = st.version;
+
+ // set last_committed_version at old version
+ dir->committing_version = dir->last_committed_version = st.version;
+ dir->projected_version = dir->version = st.version; // this is bumped, below, if dirty
+
+ // twiddle state
if (dir->state & CDIR_STATE_HASHED)
- dir->state |= CDIR_STATE_AUTH; // just inherit auth flag when hashed
+ dir->state_set( CDIR_STATE_AUTH ); // just inherit auth flag when hashed
else
dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc.
(st.state & CDIR_MASK_STATE_EXPORTED);
- dir->dir_auth = st.dir_auth;
dir->dir_rep = st.dir_rep;
dir->popularity[MDS_POP_JUSTME] += st.popularity_justme;
dir->replica_nonce = 0; // no longer defined
- if (!dir->open_by.empty())
- dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl;
+ if (!dir->replicas.empty())
+ dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replicas << endl;
dir->dir_rep_by = rep_by;
- dir->open_by = open_by;
- dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl;
- dir->open_by_nonce = open_by_nonce;
- if (!open_by.empty())
- dir->get(CDIR_PIN_OPENED);
- if (dir->is_dirty())
- dir->get(CDIR_PIN_DIRTY);
+ dir->replicas = replicas;
+ dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl;
+ if (!replicas.empty())
+ dir->get(CDir::PIN_OPENED);
+ if (dir->is_dirty()) {
+ dir->get(CDir::PIN_DIRTY);
+
+ // bump dir version by 1 if dirty
+ dir->projected_version = dir->version = st.version + 1;
+ }
}
void _encode(bufferlist& bl) {
- st.nrep_by = rep_by.size();
- st.nopen_by = open_by_nonce.size();
bl.append((char*)&st, sizeof(st));
-
- // open_by
- for (map<int,int>::iterator it = open_by_nonce.begin();
- it != open_by_nonce.end();
- it++) {
- int m = it->first;
- bl.append((char*)&m, sizeof(int));
- int n = it->second;
- bl.append((char*)&n, sizeof(int));
- }
-
- // rep_by
- for (set<int>::iterator it = rep_by.begin();
- it != rep_by.end();
- it++) {
- int m = *it;
- bl.append((char*)&m, sizeof(int));
- }
+ ::_encode(replicas, bl);
+ ::_encode(rep_by, bl);
}
int _decode(bufferlist& bl, int off = 0) {
bl.copy(off, sizeof(st), (char*)&st);
off += sizeof(st);
-
- // open_by
- for (int i=0; i<st.nopen_by; i++) {
- int m,n;
- bl.copy(off, sizeof(int), (char*)&m);
- off += sizeof(int);
- bl.copy(off, sizeof(int), (char*)&n);
- off += sizeof(int);
- open_by.insert(m);
- open_by_nonce.insert(pair<int,int>(m,n));
- }
-
- // rep_by
- for (int i=0; i<st.nrep_by; i++) {
- int m;
- bl.copy(off, sizeof(int), (char*)&m);
- off += sizeof(int);
- rep_by.insert(m);
- }
-
+ ::_decode(replicas, bl, off);
+ ::_decode(rep_by, bl, off);
return off;
}
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.inode(" << inode.ino << ") "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "
-int cinode_pins[CINODE_NUM_PINS]; // counts
+//int cinode_pins[CINODE_NUM_PINS]; // counts
ostream& operator<<(ostream& out, CInode& in)
out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" ");
if (in.is_auth()) {
out << "auth";
- if (in.is_cached_by_anyone()) {
- //out << "+" << in.get_cached_by();
- for (set<int>::iterator it = in.cached_by_begin();
- it != in.cached_by_end();
- it++) {
- out << "+" << *it << "." << in.get_cached_by_nonce(*it);
- }
- }
+ if (in.is_replicated())
+ out << in.get_replicas();
} else {
out << "rep@" << in.authority();
- //if (in.get_replica_nonce() > 1)
- out << "." << in.get_replica_nonce();
+ out << "." << in.get_replica_nonce();
assert(in.get_replica_nonce() >= 0);
}
out << " hard=" << in.hardlock;
out << " file=" << in.filelock;
- if (in.is_pinned()) {
+ if (in.get_num_ref()) {
out << " |";
for(set<int>::iterator it = in.get_ref_set().begin();
it != in.get_ref_set().end();
it++)
- if (*it < CINODE_NUM_PINS)
- out << " " << cinode_pin_names[*it];
- else
- out << " " << *it;
+ out << " " << CInode::pin_name(*it);
}
// hack: spit out crap on which clients have caps
// ====== CInode =======
-CInode::CInode(MDCache *c, bool auth) : LRUObject() {
+CInode::CInode(MDCache *c, bool auth) {
mdcache = c;
ref = 0;
+ num_parents = 0;
parent = NULL;
dir = NULL; // CDir opened separately
state = 0;
- committing_version = committed_version = 0;
-
- if (auth) state_set(CINODE_STATE_AUTH);
+ if (auth) state_set(STATE_AUTH);
}
CInode::~CInode() {
if (dir) { delete dir; dir = 0; }
}
+
+// pins
+
+void CInode::first_get()
+{
+ // pin my dentry?
+ if (parent)
+ parent->get(CDentry::PIN_INODEPIN);
+}
+
+void CInode::last_put()
+{
+ // unpin my dentry?
+ if (parent) {
+ parent->put(CDentry::PIN_INODEPIN);
+ }
+ if (num_parents == 0 && get_num_ref() == 0)
+ mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection
+}
+
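+// parent refs: num_parents counts primary + remote parent dentries;
+// when both the parent count and the pin count reach zero, the inode
+// queues itself on the cache's expire queue.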
+void CInode::get_parent()
+{
+ num_parents++;
+}
+void CInode::put_parent()
+{
+ num_parents--;
+ if (num_parents == 0 && get_num_ref() == 0)
+ mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection
+}
+
+
+
CDir *CInode::get_parent_dir()
{
if (parent)
return is_auth();
}
-CDir *CInode::get_or_open_dir(MDS *mds)
+CDir *CInode::get_or_open_dir(MDCache *mdcache)
{
assert(is_dir());
// only auth can open dir alone.
assert(is_auth());
- set_dir( new CDir(this, mds, true) );
+ set_dir( new CDir(this, mdcache, true) );
dir->dir_auth = -1;
return dir;
}
return dir;
}
+void CInode::close_dir()
+{
+ assert(dir);
+ assert(dir->get_num_ref() == 0);
+ delete dir;
+ dir = 0;
+}
+
+
void CInode::set_auth(bool a)
{
if (!is_dangling() && !is_root() &&
is_auth() != a) {
- /*
- CDir *dir = get_parent_dir();
- if (is_auth() && !a)
- dir->nauthitems--;
- else
- dir->nauthitems++;
- */
}
- if (a) state_set(CINODE_STATE_AUTH);
- else state_clear(CINODE_STATE_AUTH);
+ if (a) state_set(STATE_AUTH);
+ else state_clear(STATE_AUTH);
}
parent->dir->inode->ino(),
parent->name) );
}
- else if (state_test(CINODE_STATE_DANGLING)) {
+ else if (state_test(STATE_DANGLING)) {
dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl;
string ref_dn;
trace.push_back( new Anchor(ino(),
-void CInode::mark_dirty() {
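+// inode versioning piggybacks on the parent dentry: pre_dirty reserves
+// the dentry's next projected version, and mark_dirty stamps the inode
+// with it and dirties the dentry as well.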
+version_t CInode::pre_dirty()
+{
+ assert(parent);
+ return parent->pre_dirty();
+}
+
+void CInode::_mark_dirty()
+{
+ if (!state_test(STATE_DIRTY)) {
+ state_set(STATE_DIRTY);
+ get(PIN_DIRTY);
+ }
+}
+
+void CInode::mark_dirty(version_t pv) {
dout(10) << "mark_dirty " << *this << endl;
- if (!parent) {
- dout(10) << " dangling, not marking dirty!" << endl;
- return;
- }
+ assert(parent);
/*
NOTE: I may already be dirty, but this fn _still_ needs to be called so that
updated below.
*/
- // only auth can get dirty. "dirty" async data in replicas is relative to (say) filelock state, not dirty flag.
+ // only auth can get dirty. "dirty" async data in replicas is relative to
+ // filelock state, not the dirty flag.
assert(is_auth());
-
- // touch my private version
- inode.version++;
- if (!(state & CINODE_STATE_DIRTY)) {
- state |= CINODE_STATE_DIRTY;
- get(CINODE_PIN_DIRTY);
- }
- // relative to parent dir:
- if (parent) {
- // dir is now dirty (if it wasn't already)
- parent->dir->mark_dirty();
-
- // i now live in that (potentially newly dirty) version
- parent_dir_version = parent->dir->get_version();
- }
+ // touch my private version
+ assert(inode.version < pv);
+ inode.version = pv;
+ _mark_dirty();
+
+ // mark dentry too
+ parent->mark_dirty(pv);
}
void CInode::mark_clean()
{
dout(10) << " mark_clean " << *this << endl;
- if (state & CINODE_STATE_DIRTY) {
- state &= ~CINODE_STATE_DIRTY;
- put(CINODE_PIN_DIRTY);
+ if (state_test(STATE_DIRTY)) {
+ state_clear(STATE_DIRTY);
+ put(PIN_DIRTY);
}
}
}
-// old state encoders
-
-/*
-void CInode::encode_basic_state(bufferlist& r)
-{
- // inode
- r.append((char*)&inode, sizeof(inode));
- ::_encode(cached_by, r);
- ::_encode(cached_by_nonce, r);
-}
-
-void CInode::decode_basic_state(bufferlist& r, int& off)
-{
- // inode
- r.copy(0,sizeof(inode_t), (char*)&inode);
- off += sizeof(inode_t);
-
- bool empty = cached_by.empty();
- ::_decode(cached_by, r, off);
- ::_decode(cached_by_nonce, r, off);
- if (!empty)
- get(CINODE_PIN_CACHED);
-}
-*/
-
// waiting
// this inode.
if (waiting.size() == 0)
- get(CINODE_PIN_WAITER);
+ get(PIN_WAITER);
waiting.insert(pair<int,Context*>(tag,c));
dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
}
if (waiting.empty())
- put(CINODE_PIN_WAITER);
+ put(PIN_WAITER);
}
void CInode::finish_waiting(int mask, int result)
void CInode::auth_pin() {
if (auth_pins == 0)
- get(CINODE_PIN_AUTHPIN);
+ get(PIN_AUTHPIN);
auth_pins++;
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
void CInode::auth_unpin() {
auth_pins--;
if (auth_pins == 0)
- put(CINODE_PIN_AUTHPIN);
+ put(PIN_AUTHPIN);
dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
int CInode::authority() {
if (is_dangling())
- return dangling_auth; // explicit
- if (is_root())
- return 0; // i am root
- assert(parent);
- return parent->dir->dentry_authority( parent->name );
+ return dangling_auth; // explicit
+
+ if (is_root()) { // i am root
+ if (dir)
+ return dir->get_dir_auth(); // bit of a chicken/egg issue here!
+ else
+ return CDIR_AUTH_UNKNOWN;
+ }
+
+ if (parent)
+ return parent->dir->dentry_authority( parent->name );
+
+ return -1; // undefined (inode must not be linked yet!)
}
assert(is_auth());
// relax locks?
- if (!is_cached_by_anyone())
+ if (!is_replicated())
replicate_relax_locks();
// return the thinger
- int nonce = cached_by_add( rep );
+ int nonce = add_replica( rep );
return new CInodeDiscover( this, nonce );
}
#include "include/types.h"
#include "include/lru.h"
+#include "mdstypes.h"
+
#include "CDentry.h"
#include "Lock.h"
#include "Capability.h"
-#include "mdstypes.h"
#include <cassert>
#include <list>
using namespace std;
-
-
-
-// pins for keeping an item in cache (and debugging)
-#define CINODE_PIN_DIR 0
-#define CINODE_PIN_CACHED 1
-#define CINODE_PIN_DIRTY 2 // must flush
-#define CINODE_PIN_PROXY 3 // can't expire yet
-#define CINODE_PIN_WAITER 4 // waiter
-
-#define CINODE_PIN_CAPS 5 // local fh's
-
-#define CINODE_PIN_DNDIRTY 7 // dentry is dirty
-
-#define CINODE_PIN_AUTHPIN 8
-#define CINODE_PIN_IMPORTING 9 // multipurpose, for importing
-#define CINODE_PIN_REQUEST 10 // request is logging, finishing
-#define CINODE_PIN_RENAMESRC 11 // pinned on dest for foreign rename
-#define CINODE_PIN_ANCHORING 12
-
-#define CINODE_PIN_OPENINGDIR 13
-
-#define CINODE_PIN_DENTRYLOCK 14
-
-#define CINODE_NUM_PINS 15
-
-static char *cinode_pin_names[CINODE_NUM_PINS] = {
- "dir",
- "cached",
- "dirty",
- "proxy",
- "waiter",
- "caps",
- "--",
- "dndirty",
- "authpin",
- "imping",
- "request",
- "rensrc",
- "anching",
- "opdir",
- "dnlock"
-};
-
-
-
-
-
-
// wait reasons
#define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE
// waiters: write_hard_start, read_file_start, write_file_start (mdcache)
#define CINODE_WAIT_CAPS (1<<30)
-
-
-
#define CINODE_WAIT_ANY 0xffffffff
-// state
-#define CINODE_STATE_AUTH (1<<0)
-#define CINODE_STATE_ROOT (1<<1)
-
-#define CINODE_STATE_DIRTY (1<<2)
-#define CINODE_STATE_UNSAFE (1<<3) // not logged yet
-#define CINODE_STATE_DANGLING (1<<4) // delete me when i expire; i have no dentry
-#define CINODE_STATE_UNLINKING (1<<5)
-#define CINODE_STATE_PROXY (1<<6) // can't expire yet
-#define CINODE_STATE_EXPORTING (1<<7) // on nonauth bystander.
-
-#define CINODE_STATE_ANCHORING (1<<8)
-
-#define CINODE_STATE_OPENINGDIR (1<<9)
-
-//#define CINODE_STATE_RENAMING (1<<8) // moving me
-//#define CINODE_STATE_RENAMINGTO (1<<9) // rename target (will be unlinked)
-
// misc
#define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export
class Context;
class CDentry;
class CDir;
-class MDS;
class Message;
class CInode;
class CInodeDiscover;
class MDCache;
-//class MInodeSyncStart;
ostream& operator<<(ostream& out, CInode& in);
-extern int cinode_pins[CINODE_NUM_PINS]; // counts
+// cached inode wrapper
+class CInode : public MDSCacheObject {
+ public:
+ // -- pins --
+ static const int PIN_CACHED = 1;
+ static const int PIN_DIR = 2;
+ static const int PIN_DIRTY = 4; // must flush
+ static const int PIN_PROXY = 5; // can't expire yet
+ static const int PIN_WAITER = 6; // waiter
+ static const int PIN_CAPS = 7; // local fh's
+ static const int PIN_AUTHPIN = 8;
+ static const int PIN_IMPORTING = 9; // multipurpose, for importing
+ static const int PIN_REQUEST = 10; // request is logging, finishing
+ static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename
+ static const int PIN_ANCHORING = 12;
+
+ static const int PIN_OPENINGDIR = 13;
+
+ static const int PIN_DENTRYLOCK = 14;
+
+ static const char *pin_name(int p) {
+ switch (p) {
+ case PIN_CACHED: return "cached";
+ case PIN_DIR: return "dir";
+ case PIN_DIRTY: return "dirty";
+ case PIN_PROXY: return "proxy";
+ case PIN_WAITER: return "waiter";
+ case PIN_CAPS: return "caps";
+ case PIN_AUTHPIN: return "authpin";
+ case PIN_IMPORTING: return "importing";
+ case PIN_REQUEST: return "request";
+ case PIN_RENAMESRC: return "renamesrc";
+ case PIN_ANCHORING: return "anchoring";
+ case PIN_OPENINGDIR: return "openingdir";
+ case PIN_DENTRYLOCK: return "dentrylock";
+ default: assert(0);
+ }
+ }
+
+ // state
+ static const int STATE_AUTH = (1<<0);
+ static const int STATE_ROOT = (1<<1);
+ static const int STATE_DIRTY = (1<<2);
+ static const int STATE_UNSAFE = (1<<3); // not logged yet
+ static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry
+ static const int STATE_UNLINKING = (1<<5);
+ static const int STATE_PROXY = (1<<6); // can't expire yet
+ static const int STATE_EXPORTING = (1<<7); // on nonauth bystander.
+ static const int STATE_ANCHORING = (1<<8);
+ static const int STATE_OPENINGDIR = (1<<9);
+ //static const int STATE_RENAMING = (1<<8); // moving me
+ //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked)
+
+
-// cached inode wrapper
-class CInode : public LRUObject {
public:
MDCache *mdcache;
CDir *dir; // directory, if we have it opened.
string symlink; // symlink dest, if symlink
- // inode metadata locks
- CLock hardlock;
- CLock filelock;
-
protected:
- int ref; // reference count
- set<int> ref_set;
- version_t parent_dir_version; // parent dir version when i was last touched.
- version_t committing_version;
- version_t committed_version;
-
- unsigned state;
-
// parent dentries in cache
+ int num_parents;
CDentry *parent; // primary link
set<CDentry*> remote_parents; // if hard linked
// -- distributed caching
- set<int> cached_by; // [auth] mds's that cache me.
- /* NOTE: on replicas, this doubles as replicated_by, but the
- cached_by_* access methods below should NOT be used in those
- cases, as the semantics are different! */
- map<int,int> cached_by_nonce; // [auth] nonce issued to each replica
- int replica_nonce; // [replica] defined on replica
-
int dangling_auth; // explicit auth, when dangling.
int num_request_pins;
// waiters
multimap<int, Context*> waiting;
+
+ // -- distributed state --
+public:
+ // inode metadata locks
+ CLock hardlock;
+ CLock filelock;
+protected:
// file capabilities
map<int, Capability> client_caps; // client -> caps
-
map<int, int> mds_caps_wanted; // [auth] mds -> caps wanted
int replica_caps_wanted; // [replica] what i've requested from auth
utime_t replica_caps_wanted_keep_until;
bool is_anchored() { return inode.anchored; }
- bool is_root() { return state & CINODE_STATE_ROOT; }
- bool is_proxy() { return state & CINODE_STATE_PROXY; }
+ bool is_root() { return state & STATE_ROOT; }
+ bool is_proxy() { return state & STATE_PROXY; }
- bool is_auth() { return state & CINODE_STATE_AUTH; }
+ bool is_auth() { return state & STATE_AUTH; }
void set_auth(bool auth);
- bool is_replica() { return !is_auth(); }
- int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
inodeno_t ino() { return inode.ino; }
inode_t& get_inode() { return inode; }
CInode *get_parent_inode();
CInode *get_realm_root(); // import, hash, or root
- CDir *get_or_open_dir(MDS *mds);
+ CDir *get_or_open_dir(MDCache *mdcache);
CDir *set_dir(CDir *newdir);
+ void close_dir();
bool dir_is_auth();
// -- state --
- unsigned get_state() { return state; }
- void state_clear(unsigned mask) { state &= ~mask; }
- void state_set(unsigned mask) { state |= mask; }
- unsigned state_test(unsigned mask) { return state & mask; }
+ bool is_unsafe() { return state & STATE_UNSAFE; }
+ bool is_dangling() { return state & STATE_DANGLING; }
+ bool is_unlinking() { return state & STATE_UNLINKING; }
- bool is_unsafe() { return state & CINODE_STATE_UNSAFE; }
- bool is_dangling() { return state & CINODE_STATE_DANGLING; }
- bool is_unlinking() { return state & CINODE_STATE_UNLINKING; }
-
- void mark_unsafe() { state |= CINODE_STATE_UNSAFE; }
- void mark_safe() { state &= ~CINODE_STATE_UNSAFE; }
+ void mark_unsafe() { state |= STATE_UNSAFE; }
+ void mark_safe() { state &= ~STATE_UNSAFE; }
// -- state encoding --
//void encode_basic_state(bufferlist& r);
// -- dirtyness --
version_t get_version() { return inode.version; }
- version_t get_parent_dir_version() { return parent_dir_version; }
- void float_parent_dir_version(version_t ge) {
- if (parent_dir_version < ge)
- parent_dir_version = ge;
- }
- version_t get_committing_version() { return committing_version; }
- version_t get_last_committed_version() { return committed_version; }
- void set_committing_version(version_t v) { committing_version = v; }
- void set_committed_version() {
- committed_version = committing_version;
- committing_version = 0;
- }
- bool is_dirty() { return state & CINODE_STATE_DIRTY; }
+ bool is_dirty() { return state & STATE_DIRTY; }
bool is_clean() { return !is_dirty(); }
- void mark_dirty();
+ version_t pre_dirty();
+ void _mark_dirty();
+ void mark_dirty(version_t projected_dirv);
void mark_clean();
- // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy
- bool is_cached_by_anyone() { return !cached_by.empty(); }
- bool is_cached_by(int mds) { return cached_by.count(mds); }
- int num_cached_by() { return cached_by.size(); }
- // cached_by_add returns a nonce
- int cached_by_add(int mds) {
- int nonce = 1;
- if (is_cached_by(mds)) { // already had it?
- nonce = get_cached_by_nonce(mds) + 1; // new nonce (+1)
- dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
- cached_by_nonce.erase(mds);
- } else {
- if (cached_by.empty())
- get(CINODE_PIN_CACHED);
- cached_by.insert(mds);
- }
- cached_by_nonce.insert(pair<int,int>(mds,nonce)); // first! serial of 1.
- return nonce; // default nonce
- }
- void cached_by_add(int mds, int nonce) {
- if (cached_by.empty())
- get(CINODE_PIN_CACHED);
- cached_by.insert(mds);
- cached_by_nonce.insert(pair<int,int>(mds,nonce));
- }
- int get_cached_by_nonce(int mds) {
- map<int,int>::iterator it = cached_by_nonce.find(mds);
- return it->second;
- }
- void cached_by_remove(int mds) {
- //if (!is_cached_by(mds)) return;
- assert(is_cached_by(mds));
-
- cached_by.erase(mds);
- cached_by_nonce.erase(mds);
- if (cached_by.empty())
- put(CINODE_PIN_CACHED);
- }
- void cached_by_clear() {
- if (cached_by.size())
- put(CINODE_PIN_CACHED);
- cached_by.clear();
- cached_by_nonce.clear();
- }
- set<int>::iterator cached_by_begin() { return cached_by.begin(); }
- set<int>::iterator cached_by_end() { return cached_by.end(); }
- set<int>& get_cached_by() { return cached_by; }
CInodeDiscover* replicate_to(int rep);
void finish_waiting(int mask, int result = 0);
+ bool is_hardlock_write_wanted() {
+ return waiting_for(CINODE_WAIT_HARDW);
+ }
+ bool is_filelock_write_wanted() {
+ return waiting_for(CINODE_WAIT_FILEW);
+ }
+
// -- caps -- (new)
// client caps
map<int,Capability>& get_client_caps() { return client_caps; }
void add_client_cap(int client, Capability& cap) {
if (client_caps.empty())
- get(CINODE_PIN_CAPS);
+ get(PIN_CAPS);
assert(client_caps.count(client) == 0);
client_caps[client] = cap;
}
assert(client_caps.count(client) == 1);
client_caps.erase(client);
if (client_caps.empty())
- put(CINODE_PIN_CAPS);
+ put(PIN_CAPS);
}
Capability* get_client_cap(int client) {
if (client_caps.count(client))
/*
void set_client_caps(map<int,Capability>& cl) {
if (client_caps.empty() && !cl.empty())
- get(CINODE_PIN_CAPS);
+ get(PIN_CAPS);
client_caps.clear();
client_caps = cl;
}
*/
void take_client_caps(map<int,Capability>& cl) {
if (!client_caps.empty())
- put(CINODE_PIN_CAPS);
+ put(PIN_CAPS);
cl = client_caps;
client_caps.clear();
}
void merge_client_caps(map<int,Capability>& cl, set<int>& new_client_caps) {
if (client_caps.empty() && !cl.empty())
- get(CINODE_PIN_CAPS);
+ get(PIN_CAPS);
for (map<int,Capability>::iterator it = cl.begin();
it != cl.end();
it++) {
void replicate_relax_locks() {
assert(is_auth());
- assert(!is_cached_by_anyone());
+ assert(!is_replicated());
dout(10) << " relaxing locks on " << *this << endl;
if (hardlock.get_state() == LOCK_LOCK &&
linked to an active_request, so they're automatically cleaned
up when a request is finished. pin at will! */
void request_pin_get() {
- if (num_request_pins == 0) get(CINODE_PIN_REQUEST);
+ if (num_request_pins == 0) get(PIN_REQUEST);
num_request_pins++;
}
void request_pin_put() {
num_request_pins--;
- if (num_request_pins == 0) put(CINODE_PIN_REQUEST);
+ if (num_request_pins == 0) put(PIN_REQUEST);
assert(num_request_pins >= 0);
}
-
- bool is_pinned() { return ref > 0; }
- set<int>& get_ref_set() { return ref_set; }
- void put(int by) {
- cinode_pins[by]--;
- if (ref == 0 || ref_set.count(by) != 1) {
- dout(7) << " bad put " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
- assert(ref_set.count(by) == 1);
- assert(ref > 0);
- }
- ref--;
- ref_set.erase(by);
- if (ref == 0)
- lru_unpin();
- dout(7) << " put " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
- }
- void get(int by) {
- cinode_pins[by]++;
- if (ref == 0)
- lru_pin();
- if (ref_set.count(by)) {
- dout(7) << " bad get " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
- assert(ref_set.count(by) == 0);
- }
- ref++;
- ref_set.insert(by);
- dout(7) << " get " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+ void bad_put(int by) {
+ dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl;
+ assert(ref_set.count(by) == 1);
+ assert(ref > 0);
}
- bool is_pinned_by(int by) {
- return ref_set.count(by);
+ void bad_get(int by) {
+ dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl;
+ assert(ref_set.count(by) == 0);
}
+ void first_get();
+ void last_put();
+
// -- hierarchy stuff --
+private:
+ void get_parent();
+ void put_parent();
+
+public:
void set_primary_parent(CDentry *p) {
+ assert(parent == 0);
parent = p;
+ get_parent();
}
void remove_primary_parent(CDentry *dn) {
assert(dn == parent);
parent = 0;
+ put_parent();
}
void add_remote_parent(CDentry *p) {
+ if (remote_parents.empty())
+ get_parent();
remote_parents.insert(p);
}
void remove_remote_parent(CDentry *p) {
remote_parents.erase(p);
+ if (remote_parents.empty())
+ put_parent();
}
int num_remote_parents() {
return remote_parents.size();
int num_caps;
} st;
- set<int> cached_by;
- map<int,int> cached_by_nonce;
+ map<int,int> replicas;
map<int,Capability> cap_map;
CLock hardlock,filelock;
CInodeExport(CInode *in) {
st.inode = in->inode;
st.is_dirty = in->is_dirty();
- cached_by = in->cached_by;
- cached_by_nonce = in->cached_by_nonce;
+ replicas = in->replicas;
hardlock = in->hardlock;
filelock = in->filelock;
-
+
st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] );
st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] );
in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
in->popularity[MDS_POP_NESTED] += st.popularity_curdom;
- if (st.is_dirty) {
- in->mark_dirty();
- }
+ if (st.is_dirty)
+ in->_mark_dirty();
- in->cached_by.clear();
- in->cached_by = cached_by;
- in->cached_by_nonce = cached_by_nonce;
- if (!cached_by.empty())
- in->get(CINODE_PIN_CACHED);
+ in->replicas = replicas;
+ if (!replicas.empty())
+ in->get(CInode::PIN_CACHED);
in->hardlock = hardlock;
in->filelock = filelock;
bl.append((char*)&st, sizeof(st));
// cached_by + nonce
- ::_encode(cached_by, bl);
- ::_encode(cached_by_nonce, bl);
+ ::_encode(replicas, bl);
hardlock.encode_state(bl);
filelock.encode_state(bl);
bl.copy(off, sizeof(st), (char*)&st);
off += sizeof(st);
- ::_decode(cached_by, bl, off);
- ::_decode(cached_by_nonce, bl, off);
+ ::_decode(replicas, bl, off);
hardlock.decode_state(bl, off);
filelock.decode_state(bl, off);
#include <ext/hash_map>
using namespace __gnu_cxx;
+
+/*
+ * this structure is used by the MDS purely so that
+ * it can remember client addresses (entity_inst_t)
+ * while processing request(s) on behalf of clients.
+ * as such it's really just a short-term cache.
+ *
+ * it also remembers which clients mounted via this MDS,
+ * for the same reason (so that mounted clients can be
+ * contacted if necessary).
+ */
class ClientMap {
hash_map<int,entity_inst_t> client_inst;
set<int> client_mount;
// -- lock... hard or file
+class Message;
+
class CLock {
protected:
// lock state
char state;
set<int> gather_set; // auth
- int nread, nwrite;
+
+ // local state
+ int nread;
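+ // at most one request may hold the write (x)lock at a time, so a
+ // single Message* replaces the old nwrite counter.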
+ Message *wrlock_by;
public:
CLock() :
- state(LOCK_LOCK),
+ state(LOCK_SYNC),
nread(0),
- nwrite(0) {
+ wrlock_by(0) {
}
// encode/decode
void encode_state(bufferlist& bl) {
bl.append((char*)&state, sizeof(state));
- bl.append((char*)&nread, sizeof(nread));
- bl.append((char*)&nwrite, sizeof(nwrite));
-
_encode(gather_set, bl);
+
+ //bl.append((char*)&nread, sizeof(nread));
+ //bl.append((char*)&nwrite, sizeof(nwrite));
}
void decode_state(bufferlist& bl, int& off) {
bl.copy(off, sizeof(state), (char*)&state);
off += sizeof(state);
- bl.copy(off, sizeof(nread), (char*)&nread);
- off += sizeof(nread);
- bl.copy(off, sizeof(nwrite), (char*)&nwrite);
- off += sizeof(nwrite);
-
_decode(gather_set, bl, off);
+
+ //bl.copy(off, sizeof(nread), (char*)&nread);
+ //off += sizeof(nread);
+ //bl.copy(off, sizeof(nwrite), (char*)&nwrite);
+ //off += sizeof(nwrite);
}
char get_state() { return state; }
// gather set
set<int>& get_gather_set() { return gather_set; }
- void init_gather(set<int>& i) {
- gather_set = i;
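+ // replicas are tracked as a map<mds,nonce>; gathering only needs the
+ // mds ranks (the keys).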
+ void init_gather(const map<int,int>& i) {
+ for (map<int,int>::const_iterator p = i.begin(); p != i.end(); ++p)
+ gather_set.insert(p->first);
}
bool is_gathering(int i) {
return gather_set.count(i);
}
int get_nread() { return nread; }
- int get_write() { return ++nwrite; }
- int put_write() {
- assert(nwrite>0);
- return --nwrite;
+ void get_write(Message *who) {
+ assert(wrlock_by == 0);
+ wrlock_by = who;
+ }
+ void put_write() {
+ assert(wrlock_by);
+ wrlock_by = 0;
}
- int get_nwrite() { return nwrite; }
+ bool is_wrlocked() { return wrlock_by ? true:false; }
+ Message *get_wrlocked_by() { return wrlock_by; }
bool is_used() {
- return (nwrite+nread)>0 ? true:false;
+ return (is_wrlocked() || (nread>0)) ? true:false;
}
-
+
// stable
bool is_stable() {
bool can_write(bool auth) {
if (auth)
- return (state == LOCK_LOCK);
+ return (state == LOCK_LOCK) && !is_wrlocked();
else
return false;
}
if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
if (l.get_nread())
- out << " " << l.get_nread() << "r";
- if (l.get_nwrite())
- out << " " << l.get_nwrite() << "w";
+ out << " r=" << l.get_nread();
+ if (l.is_wrlocked())
+ out << " w=" << l.get_wrlocked_by();
// rw?
/*
#include "include/filepath.h"
-#include "events/EInodeUpdate.h"
-#include "events/EDirUpdate.h"
+#include "events/EString.h"
+#include "events/EUpdate.h"
#include "events/EUnlink.h"
#include "msg/Messenger.h"
}
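+// helpers: broadcast an MLock message of the given type to every
+// replica of an inode or dentry.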
+void Locker::send_lock_message(CInode *in, int msg, int type)
+{
+ for (map<int,int>::iterator it = in->replicas_begin();
+ it != in->replicas_end();
+ it++) {
+ MLock *m = new MLock(msg, mds->get_nodeid());
+ m->set_ino(in->ino(), type);
+ mds->send_message_mds(m, it->first, MDS_PORT_LOCKER);
+ }
+}
+
+
+void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data)
+{
+ for (map<int,int>::iterator it = in->replicas_begin();
+ it != in->replicas_end();
+ it++) {
+ MLock *m = new MLock(msg, mds->get_nodeid());
+ m->set_ino(in->ino(), type);
+ m->set_data(data);
+ mds->send_message_mds(m, it->first, MDS_PORT_LOCKER);
+ }
+}
+
+void Locker::send_lock_message(CDentry *dn, int msg)
+{
+ for (map<int,int>::iterator it = dn->replicas_begin();
+ it != dn->replicas_end();
+ it++) {
+ MLock *m = new MLock(msg, mds->get_nodeid());
+ m->set_dn(dn->dir->ino(), dn->name);
+ mds->send_message_mds(m, it->first, MDS_PORT_LOCKER);
+ }
+}
+
// file i/o -----------------------------------------
it->second.get_last_seq(),
it->second.pending(),
it->second.wanted()),
- MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+ mds->clientmap.get_inst(it->first),
0, MDS_PORT_LOCKER);
}
}
MClientFileCaps *r = new MClientFileCaps(in->inode,
0, 0, 0,
MClientFileCaps::FILECAP_RELEASE);
- mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER);
+ mds->messenger->send_message(r, m->get_source_inst(), 0, MDS_PORT_LOCKER);
}
// merge in atime?
}
if (dirty)
- mds->mdlog->submit_entry(new EInodeUpdate(in));
+ mds->mdlog->submit_entry(new EString("cap inode update dirty fixme"));
}
// reevaluate, waiters
// if not replicated, i can twiddle lock at will
if (in->is_auth() &&
- !in->is_cached_by_anyone() &&
+ !in->is_replicated() &&
in->hardlock.get_state() != LOCK_LOCK)
in->hardlock.set_state(LOCK_LOCK);
}
in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_*
- in->hardlock.get_write();
+ in->hardlock.get_write(m);
return true;
}
void Locker::inode_hard_write_finish(CInode *in)
{
// drop ref
- assert(in->hardlock.can_write(in->is_auth()));
+ //assert(in->hardlock.can_write(in->is_auth()));
in->hardlock.put_write();
in->auth_unpin();
dout(7) << "inode_hard_write_finish on " << *in << endl;
-
- // drop lock?
- if (in->hardlock.get_nwrite() == 0) {
+ // others waiting?
+ if (in->is_hardlock_write_wanted()) {
+ // wake 'em up
+ in->take_waiting(CINODE_WAIT_HARDW, mds->finished_queue);
+ } else {
// auto-sync if alone.
if (in->is_auth() &&
- !in->is_cached_by_anyone() &&
+ !in->is_replicated() &&
in->hardlock.get_state() != LOCK_SYNC)
in->hardlock.set_state(LOCK_SYNC);
in->hardlock.set_state(LOCK_LOCK);
// waiters
- in->hardlock.get_write();
+ //in->hardlock.get_write();
in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE);
- in->hardlock.put_write();
+ //in->hardlock.put_write();
break;
default:
if (in->is_auth()) {
// sync?
- if (in->is_cached_by_anyone() &&
- in->hardlock.get_nwrite() == 0 &&
+ if (in->is_replicated() &&
+ in->is_hardlock_write_wanted() &&
in->hardlock.get_state() != LOCK_SYNC) {
dout(7) << "inode_hard_eval stable, syncing " << *in << endl;
inode_hard_sync(in);
in->encode_hard_state(harddata);
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
- m->set_data(harddata);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata);
// change lock
in->hardlock.set_state(LOCK_SYNC);
assert(in->hardlock.get_state() == LOCK_SYNC);
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD);
// change lock
in->hardlock.set_state(LOCK_GLOCKR);
- in->hardlock.init_gather(in->get_cached_by());
+ in->hardlock.init_gather(in->get_replicas());
}
if (in->filelock.can_read(in->is_auth())) {
in->filelock.get_read();
- in->filelock.get_write();
+ //in->filelock.get_write();
in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
- in->filelock.put_write();
+ //in->filelock.put_write();
return true;
}
} else {
bool Locker::inode_file_write_start(CInode *in, MClientRequest *m)
{
- // can write? grab ref.
- if (in->filelock.can_write(in->is_auth())) {
- in->filelock.get_write();
- return true;
- }
+ // can't write?
+ if (!in->filelock.can_write(in->is_auth())) {
- // can't write, replicated.
- if (in->is_auth()) {
- // auth
- if (in->filelock.can_write_soon(in->is_auth())) {
- // just wait
- } else {
- if (!in->filelock.is_stable()) {
- dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
- in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
- return false;
- }
-
- // initiate lock
- inode_file_lock(in);
-
- if (in->filelock.can_write(in->is_auth())) {
- in->filelock.get_write();
-
- in->filelock.get_read();
- in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
- in->filelock.put_read();
- return true;
+ // can't write.
+ if (in->is_auth()) {
+ // auth
+ if (!in->filelock.can_write_soon(in->is_auth())) {
+ if (!in->filelock.is_stable()) {
+ dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+
+ // initiate lock
+ inode_file_lock(in);
+
+ // fall-thru to below.
}
+ } else {
+ // replica
+ // fw to auth
+ int auth = in->authority();
+ dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mdcache->request_forward(m, auth);
+ return false;
+ }
+ }
+
+ // check again
+ if (in->filelock.can_write(in->is_auth())) {
+ // can i auth pin?
+ assert(in->is_auth());
+ if (!in->can_auth_pin()) {
+ dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
}
+ in->auth_pin();
+ in->filelock.get_write(m);
+ return true;
+ } else {
dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl;
in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in));
return false;
- } else {
- // replica
- // fw to auth
- int auth = in->authority();
- dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- mdcache->request_forward(m, auth);
- return false;
}
}
dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
// drop lock?
- if (in->filelock.get_nwrite() == 0) {
+ if (!in->is_filelock_write_wanted()) {
in->finish_waiting(CINODE_WAIT_FILENOWR);
inode_file_eval(in);
}
// waiters
in->filelock.get_read();
- in->filelock.get_write();
+ //in->filelock.get_write();
in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
in->filelock.put_read();
- in->filelock.put_write();
+ //in->filelock.put_write();
}
break;
if ((issued & ~(CAP_FILE_WR)) == 0) {
in->filelock.set_state(LOCK_MIXED);
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// data
bufferlist softdata;
in->encode_file_state(softdata);
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata);
}
in->finish_waiting(CINODE_WAIT_FILESTABLE);
bufferlist softdata;
in->encode_file_state(softdata);
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- reply->set_data(softdata);
- mds->send_message_mds(reply, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata);
}
// waiters
// * -> loner?
if (in->filelock.get_nread() == 0 &&
- in->filelock.get_nwrite() == 0 &&
+ !in->is_filelock_write_wanted() &&
(wanted & CAP_FILE_WR) &&
loner &&
in->filelock.get_state() != LOCK_LONER) {
// * -> mixed?
else if (in->filelock.get_nread() == 0 &&
- in->filelock.get_nwrite() == 0 &&
+ !in->is_filelock_write_wanted() &&
(wanted & CAP_FILE_RD) &&
(wanted & CAP_FILE_WR) &&
!(loner && in->filelock.get_state() == LOCK_LONER) &&
}
// * -> sync?
- else if (in->filelock.get_nwrite() == 0 &&
+ else if (!in->is_filelock_write_wanted() &&
!(wanted & CAP_FILE_WR) &&
((wanted & CAP_FILE_RD) ||
- in->is_cached_by_anyone() ||
+ in->is_replicated() ||
(!loner && in->filelock.get_state() == LOCK_LONER)) &&
in->filelock.get_state() != LOCK_SYNC) {
dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl;
}
// * -> lock? (if not replicated or open)
- else if (!in->is_cached_by_anyone() &&
+ else if (!in->is_replicated() &&
wanted == 0 &&
in->filelock.get_state() != LOCK_LOCK) {
inode_file_lock(in);
assert((in->get_caps_wanted() & CAP_FILE_WR) == 0);
if (in->filelock.get_state() == LOCK_LOCK) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// soft data
bufferlist softdata;
in->encode_file_state(softdata);
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata);
}
// change lock
} else {
// no writers, go straight to sync
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE);
}
// change lock
issue_caps(in);
} else {
// no writers, go straight to sync
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE);
}
// change lock
}
+
void Locker::inode_file_lock(CInode *in)
{
dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl;
int issued = in->get_caps_issued();
if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
- in->filelock.init_gather(in->get_cached_by());
+ send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE);
+ in->filelock.init_gather(in->get_replicas());
// change lock
in->filelock.set_state(LOCK_GLOCKR);
}
else if (in->filelock.get_state() == LOCK_MIXED) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
- in->filelock.init_gather(in->get_cached_by());
+ send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE);
+ in->filelock.init_gather(in->get_replicas());
// change lock
in->filelock.set_state(LOCK_GLOCKM);
int issued = in->get_caps_issued();
if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
- in->filelock.init_gather(in->get_cached_by());
+ send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE);
+ in->filelock.init_gather(in->get_replicas());
in->filelock.set_state(LOCK_GMIXEDR);
issue_caps(in);
}
else if (in->filelock.get_state() == LOCK_LOCK) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// data
bufferlist softdata;
in->encode_file_state(softdata);
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata);
}
// change lock
in->filelock.set_state(LOCK_GMIXEDL);
issue_caps(in);
}
- else if (in->is_cached_by_anyone()) {
+ else if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE);
in->filelock.set_state(LOCK_MIXED);
issue_caps(in);
} else {
assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty());
if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
- in->filelock.init_gather(in->get_cached_by());
+ send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE);
+ in->filelock.init_gather(in->get_replicas());
// change lock
in->filelock.set_state(LOCK_GLONERR);
}
else if (in->filelock.get_state() == LOCK_MIXED) {
- if (in->is_cached_by_anyone()) {
+ if (in->is_replicated()) {
// bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
- in->filelock.init_gather(in->get_cached_by());
+ send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE);
+ in->filelock.init_gather(in->get_replicas());
// change lock
in->filelock.set_state(LOCK_GLONERM);
issue_caps(in);
// waiters
- in->filelock.get_write();
+ //in->filelock.get_write();
in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE);
- in->filelock.put_write();
+ //in->filelock.put_write();
inode_file_eval(in);
break;
// mine!
dn->xlockedby = m;
- if (dn->dir->is_open_by_anyone()) {
+ if (dn->is_replicated()) {
dn->lockstate = DN_LOCK_PREXLOCK;
// xlock with whom?
- set<int> who = dn->dir->get_open_by();
+ set<int> who;
+ for (map<int,int>::iterator p = dn->replicas_begin();
+ p != dn->replicas_end();
+ ++p)
+ who.insert(p->first);
dn->gather_set = who;
// make path
// tell replicas?
if (!quiet) {
// tell even if dn is null.
- if (dn->dir->is_open_by_anyone()) {
- for (set<int>::iterator it = dn->dir->open_by_begin();
- it != dn->dir->open_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_dn(dn->dir->ino(), dn->name);
- mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
- }
+ if (dn->is_replicated()) {
+ send_lock_message(dn, LOCK_AC_SYNC);
}
}
void dispatch(Message *m);
+ void send_lock_message(CInode *in, int msg, int type);
+ void send_lock_message(CInode *in, int msg, int type, bufferlist& data);
+ void send_lock_message(CDentry *dn, int msg);
+
// -- locks --
// high level interface
public:
// events i know of
#include "events/EString.h"
-#include "events/EInodeUpdate.h"
-#include "events/EDirUpdate.h"
+#include "events/EImportMap.h"
+#include "events/EMetaBlob.h"
+#include "events/EUpdate.h"
#include "events/EUnlink.h"
#include "events/EAlloc.h"
-#include "events/EMknod.h"
-#include "events/EMkdir.h"
#include "events/EPurgeFinish.h"
+#include "events/EExportStart.h"
+#include "events/EExportFinish.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
LogEvent *LogEvent::decode(bufferlist& bl)
{
// create event
LogEvent *le;
switch (type) {
- case EVENT_STRING: // string
- le = new EString();
- break;
-
- case EVENT_INODEUPDATE:
- le = new EInodeUpdate();
- break;
-
- case EVENT_DIRUPDATE:
- le = new EDirUpdate();
- break;
-
- case EVENT_UNLINK:
- le = new EUnlink();
- break;
-
- case EVENT_PURGEFINISH:
- le = new EPurgeFinish();
- break;
-
- case EVENT_ALLOC:
- le = new EAlloc();
- break;
-
- case EVENT_MKNOD:
- le = new EMknod();
- break;
-
- case EVENT_MKDIR:
- le = new EMkdir();
- break;
-
+ case EVENT_STRING: le = new EString(); break;
+ case EVENT_IMPORTMAP: le = new EImportMap; break;
+ case EVENT_UPDATE: le = new EUpdate; break;
+ case EVENT_UNLINK: le = new EUnlink(); break;
+ case EVENT_PURGEFINISH: le = new EPurgeFinish(); break;
+ case EVENT_ALLOC: le = new EAlloc(); break;
+ case EVENT_EXPORTSTART: le = new EExportStart; break;
+ case EVENT_EXPORTFINISH: le = new EExportFinish; break;
+ case EVENT_IMPORTSTART: le = new EImportStart; break;
+ case EVENT_IMPORTFINISH: le = new EImportFinish; break;
default:
- dout(1) << "uh oh, unknown event type " << type << endl;
+ dout(1) << "uh oh, unknown log event type " << type << endl;
assert(0);
}
#define EVENT_INODEUPDATE 2
#define EVENT_DIRUPDATE 3
+#define EVENT_IMPORTMAP 4
+#define EVENT_UPDATE 5
+
#define EVENT_ALLOC 10
#define EVENT_MKNOD 11
#define EVENT_MKDIR 12
#define EVENT_RMDIR 21
#define EVENT_PURGEFINISH 22
+#define EVENT_EXPORTSTART 30
+#define EVENT_EXPORTFINISH 31
+#define EVENT_IMPORTSTART 32
+#define EVENT_IMPORTFINISH 33
+
+
#include <string>
using namespace std;
class LogEvent {
private:
int _type;
- off_t _end_off;
+ off_t _start_off,_end_off;
friend class MDLog;
public:
- LogEvent(int t) : _type(t), _end_off(0) { }
+ LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { }
virtual ~LogEvent() { }
+ int get_type() { return _type; }
+ off_t get_start_off() { return _start_off; }
+ off_t get_end_off() { return _end_off; }
+
// encoding
virtual void encode_payload(bufferlist& bl) = 0;
virtual void decode_payload(bufferlist& bl, int& off) = 0;
/* obsolete() - is this entry committed to primary store, such that
* we can expire it from the journal?
*/
- virtual bool can_expire(MDS *m) {
+ virtual bool has_expired(MDS *m) {
return true;
}
- /* retire() - prod MDS into committing hte relevant state so that this
+ /* expire() - prod MDS into committing the relevant state so that this
* entry can be expired from the journal.
*/
- virtual void retire(MDS *m, Context *c) {
+ virtual void expire(MDS *m, Context *c) {
+ assert(0);
c->finish(0);
delete c;
}
/*** recovery ***/
-
- /* has_happened() - true if this event has already been applied.
- */
- virtual bool has_happened(MDS *m) { return true; }
-
- /* replay() - replay given event
+ /* replay() - replay given event. this is idempotent.
*/
virtual void replay(MDS *m) { assert(0); }
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal "
+#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal "
#define MIN_LOAD 50 // ??
#define MIN_REEXPORT 5 // will automatically reexport
#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
+
int MDBalancer::proc_message(Message *m)
{
switch (m->get_type()) {
}
+
+
+void MDBalancer::tick()
+{
+ static int num_bal_times = g_conf.mds_bal_max;
+ static utime_t first = g_clock.now();
+ utime_t now = g_clock.now();
+ utime_t elapsed = now;
+ elapsed -= first;
+
+ // balance?
+ if (true &&
+ mds->get_nodeid() == 0 &&
+ (num_bal_times ||
+ (g_conf.mds_bal_max_until >= 0 &&
+ elapsed.sec() > g_conf.mds_bal_max_until)) &&
+ mds->is_active() &&
+ now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) {
+ last_heartbeat = now;
+ send_heartbeat();
+ num_bal_times--;
+ }
+
+ // hash?
+ if (true &&
+ g_conf.num_mds > 1 &&
+ now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) {
+ last_hash = now;
+ do_hashing();
+ }
+}
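
// tick() above rate-limits two periodic jobs (heartbeats and hashing) by
// comparing "now" against the last time each job ran. A standalone sketch of
// that interval throttle, with std::chrono standing in for utime_t; the
// Throttle type is illustrative only, not part of this patch.
#include <chrono>
#include <iostream>

struct Throttle {
  std::chrono::steady_clock::time_point last;   // default (epoch) => fires immediately
  std::chrono::seconds interval;
  explicit Throttle(int secs) : interval(secs) {}
  // true (and records the time) only if 'interval' has elapsed since last.
  bool ready() {
    std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
    if (now - last < interval) return false;
    last = now;
    return true;
  }
};

int main() {
  Throttle heartbeat(10);  // cf. g_conf.mds_bal_interval
  Throttle hash(30);       // cf. g_conf.mds_bal_hash_interval
  // imagine this body invoked from a periodic timer, like tick()
  if (heartbeat.ready()) std::cout << "send_heartbeat()" << std::endl;
  if (hash.ready())      std::cout << "do_hashing()" << std::endl;
  return 0;
}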
+
+
+
+
class C_Bal_SendHeartbeat : public Context {
public:
MDS *mds;
}
- int size = mds->get_mds_map()->get_num_mds();
- for (int i = 0; i<size; i++) {
- if (i == mds->get_nodeid()) continue;
+ set<int> up;
+ mds->get_mds_map()->get_up_mds_set(up);
+ for (set<int>::iterator p = up.begin(); p != up.end(); ++p) {
+ if (*p == mds->get_nodeid()) continue;
MHeartbeat *hb = new MHeartbeat(load, beat_epoch);
hb->get_import_map() = import_map;
mds->messenger->send_message(hb,
- MSG_ADDR_MDS(i), mds->mdsmap->get_inst(i),
- MDS_PORT_BALANCER,
- MDS_PORT_BALANCER);
+ mds->mdsmap->get_inst(*p),
+ MDS_PORT_BALANCER, MDS_PORT_BALANCER);
}
}
<< " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load()
<< endl;
mds->mdcache->migrator->export_dir(*it, target);
+
+ // hack! only do one dir.
+ break;
}
}
void MDBalancer::show_imports(bool external)
{
- int db = 20; //debug level
- return;
-
- if (mds->mdcache->imports.empty() &&
- mds->mdcache->hashdirs.empty()) {
- dout(db) << "no imports/exports/hashdirs" << endl;
- return;
- }
- dout(db) << "imports/exports/hashdirs:" << endl;
-
- set<CDir*> ecopy = mds->mdcache->exports;
-
- set<CDir*>::iterator it = mds->mdcache->hashdirs.begin();
- while (1) {
- if (it == mds->mdcache->hashdirs.end()) it = mds->mdcache->imports.begin();
- if (it == mds->mdcache->imports.end() ) break;
-
- CDir *im = *it;
-
- if (im->is_import()) {
- dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
- assert( im->is_auth() );
- }
- else if (im->is_hashed()) {
- if (im->is_import()) continue; // if import AND hash, list as import.
- dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
- }
-
- for (set<CDir*>::iterator p = mds->mdcache->nested_exports[im].begin();
- p != mds->mdcache->nested_exports[im].end();
- p++) {
- CDir *exp = *p;
- if (exp->is_hashed()) {
- //assert(0); // we don't do it this way actually
- dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
- assert( !exp->is_auth() );
- } else {
- dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
- assert( exp->is_export() );
- assert( !exp->is_auth() );
- }
-
- if ( mds->mdcache->get_auth_container(exp) != im ) {
- dout(1) << "uh oh, auth container is " << mds->mdcache->get_auth_container(exp) << endl;
- dout(1) << "uh oh, auth container is " << *mds->mdcache->get_auth_container(exp) << endl;
- assert( mds->mdcache->get_auth_container(exp) == im );
- }
-
- if (ecopy.count(exp) != 1) {
- dout(1) << "***** nested_export " << *exp << " not in exports" << endl;
- assert(0);
- }
- ecopy.erase(exp);
- }
-
- it++;
- }
-
- if (ecopy.size()) {
- for (set<CDir*>::iterator it = ecopy.begin();
- it != ecopy.end();
- it++)
- dout(1) << "***** stray item in exports: " << **it << endl;
- assert(ecopy.size() == 0);
- }
+ mds->mdcache->show_imports();
}
class MDBalancer {
protected:
MDS *mds;
-
int beat_epoch;
+ utime_t last_heartbeat;
+ utime_t last_hash;
+
// todo
set<inodeno_t> hash_queue;
}
public:
- MDBalancer(MDS *m) {
- mds = m;
- beat_epoch = 0;
- }
+ MDBalancer(MDS *m) :
+ mds(m),
+ beat_epoch(0) { }
mds_load_t get_load();
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);
+ void tick();
+
void do_hashing();
void export_empties();
#include "osdc/Filer.h"
+#include "events/EImportMap.h"
+#include "events/EString.h"
#include "events/EUnlink.h"
#include "events/EPurgeFinish.h"
#include "messages/MGenericMessage.h"
+
+#include "messages/MMDSImportMap.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSCacheRejoinAck.h"
+
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
lru.lru_set_midpoint(g_conf.mds_cache_mid);
did_shutdown_exports = false;
+ did_shutdown_log_cap = false;
shutdown_commits = 0;
}
}
+
void MDCache::log_stat(Logger *logger)
{
if (get_root()) {
void MDCache::add_inode(CInode *in)
{
// add to lru, inode map
- assert(inode_map.size() == lru.lru_get_size());
- lru.lru_insert_mid(in);
assert(inode_map.count(in->ino()) == 0); // should be no dup inos!
inode_map[ in->ino() ] = in;
- assert(inode_map.size() == lru.lru_get_size());
}
void MDCache::remove_inode(CInode *o)
dn->dir->unlink_inode(dn); // leave dentry
}
inode_map.erase(o->ino()); // remove from map
- lru.lru_remove(o); // remove from lru
+}
+
+
+/*
+ * take note of where we write import_maps in the log, as we need
+ * to take care not to expire them until an updated map is safely flushed.
+ */
+class C_MDS_WroteImportMap : public Context {
+ MDLog *mdlog;
+ off_t end_off;
+public:
+ C_MDS_WroteImportMap(MDLog *ml, off_t eo) : mdlog(ml), end_off(eo) { }
+ void finish(int r) {
+ // cout << "WroteImportMap at " << end_off << endl;
+ if (r >= 0)
+ mdlog->last_import_map = end_off;
+ mdlog->writing_import_map = false;
+ }
+};
+
+
+
+void MDCache::log_import_map(Context *onsync)
+{
+ dout(10) << "log_import_map " << imports.size() << " imports, "
+ << exports.size() << " exports" << endl;
+
+ EImportMap *le = new EImportMap;
+
+ // include import/export inodes,
+ // and a spanning tree to tie it to the root of the fs
+ for (set<CDir*>::iterator p = imports.begin();
+ p != imports.end();
+ p++) {
+ CDir *im = *p;
+ le->imports.insert(im->ino());
+ le->metablob.add_dir_context(im, true);
+ le->metablob.add_dir(im, false);
+
+ if (nested_exports.count(im)) {
+ for (set<CDir*>::iterator q = nested_exports[im].begin();
+ q != nested_exports[im].end();
+ ++q) {
+ CDir *ex = *q;
+ le->nested_exports[im->ino()].insert(ex->ino());
+ le->exports.insert(ex->ino());
+ le->metablob.add_dir_context(ex);
+ le->metablob.add_dir(ex, false);
+ }
+ }
+ }
+
+ mds->mdlog->writing_import_map = true;
+ mds->mdlog->submit_entry(le);
+ mds->mdlog->wait_for_sync(new C_MDS_WroteImportMap(mds->mdlog, mds->mdlog->get_write_pos()));
+ if (onsync)
+ mds->mdlog->wait_for_sync(onsync);
+}
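
// The comment above C_MDS_WroteImportMap explains the bookkeeping: remember
// where the last import map landed in the journal so expiry never runs past
// it until a newer map is safely on disk. A standalone sketch of that
// clamping idea; 'Journal' and its fields are illustrative only and do not
// reflect the actual MDLog interface.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

struct Journal {
  int64_t write_pos = 0;        // next append offset
  int64_t expire_pos = 0;       // everything before this may be reclaimed
  int64_t last_import_map = 0;  // set by the flush callback (cf. C_MDS_WroteImportMap)

  void append(int64_t len) { write_pos += len; }
  // clamp expiry so the newest durable import map is never dropped
  void expire_to(int64_t pos) { expire_pos = std::min(pos, last_import_map); }
};

int main() {
  Journal j;
  j.append(100);
  j.last_import_map = j.write_pos;  // import map flushed at offset 100
  j.append(400);                    // later events
  j.expire_to(300);                 // expiry request beyond the import map
  assert(j.expire_pos == 100);      // held back at the import map
  std::cout << "expire_pos=" << j.expire_pos << std::endl;
  return 0;
}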
+
+
+
+
+
+// =====================
+// recovery stuff
+
+void MDCache::send_pending_import_maps()
+{
+ if (wants_import_map.empty())
+ return; // nothing to send.
+
+ // only if it's appropriate!
+ if (migrator->is_exporting()) {
+ dout(7) << "send_pending_import_maps waiting, exports still in progress" << endl;
+ return; // not now
+ }
+
+ // ok, send them.
+ for (set<int>::iterator p = wants_import_map.begin();
+ p != wants_import_map.end();
+ p++)
+ send_import_map_now(*p);
+ wants_import_map.clear();
+}
+
+void MDCache::send_import_map(int who)
+{
+ if (migrator->is_exporting())
+ send_import_map_later(who);
+ else
+ send_import_map_now(who);
+}
+
+void MDCache::send_import_map_now(int who)
+{
+ dout(10) << "send_import_map to mds" << who << endl;
+
+ MMDSImportMap *m = new MMDSImportMap;
+
+ // known
+ for (set<CDir*>::iterator p = imports.begin();
+ p != imports.end();
+ p++) {
+ CDir *im = *p;
+
+ if (migrator->is_importing(im->ino())) {
+ // ambiguous (mid-import)
+ m->add_ambiguous_import(im->ino(),
+ migrator->get_import_bounds(im->ino()));
+ } else {
+ // not ambiguous.
+ m->add_import(im->ino());
+
+ if (nested_exports.count(im)) {
+ for (set<CDir*>::iterator q = nested_exports[im].begin();
+ q != nested_exports[im].end();
+ ++q) {
+ CDir *ex = *q;
+ m->add_import_export(im->ino(), ex->ino());
+ }
+ }
+ }
+ }
+
+ // ambiguous
+ for (map<inodeno_t, set<inodeno_t> >::iterator p = my_ambiguous_imports.begin();
+ p != my_ambiguous_imports.end();
+ ++p)
+ m->add_ambiguous_import(p->first, p->second);
+
+ // send
+ mds->send_message_mds(m, who, MDS_PORT_CACHE);
}
+/*
+ * during resolve state, we share import_maps to determine who
+ * is authoritative for which trees. we expect to get an import_map
+ * from _everyone_ in the recovery_set (the mds cluster at the time of
+ * the first failure).
+ */
+void MDCache::handle_import_map(MMDSImportMap *m)
+{
+ dout(7) << "handle_import_map from " << m->get_source() << endl;
+ int from = m->get_source().num();
+
+ // FIXME: check if we are a surviving ambiguous importer
+
+ // update my dir_auth values
+ for (map<inodeno_t, set<inodeno_t> >::iterator pi = m->imap.begin();
+ pi != m->imap.end();
+ ++pi) {
+ CInode *imi = get_inode(pi->first);
+ if (!imi) continue;
+ CDir *im = imi->dir;
+ if (!im) continue;
+
+ im->set_dir_auth(from);
+
+ for (set<inodeno_t>::iterator pe = pi->second.begin();
+ pe != pi->second.end();
+ ++pe) {
+ CInode *exi = get_inode(*pe);
+ if (!exi) continue;
+ CDir *ex = exi->dir;
+ if (!ex) continue;
+
+ if (ex->get_dir_auth() == CDIR_AUTH_PARENT)
+ ex->set_dir_auth(CDIR_AUTH_UNKNOWN);
+ }
+ }
+
+ // note ambiguous imports too
+ for (map<inodeno_t, set<inodeno_t> >::iterator pi = m->ambiguous_imap.begin();
+ pi != m->ambiguous_imap.end();
+ ++pi)
+ mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second );
+
+ // did i get them all?
+ got_import_map.insert(from);
+
+ if (got_import_map == recovery_set) {
+ dout(10) << "got all import maps, ready to rejoin" << endl;
+ disambiguate_imports();
+ recalc_auth_bits();
+ trim_non_auth();
+
+ // move to rejoin state
+ mds->set_want_state(MDSMap::STATE_REJOIN);
+
+ } else {
+ dout(10) << "still waiting for more importmaps, got " << got_import_map
+ << ", need " << recovery_set << endl;
+ }
+
+ delete m;
+}
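
// handle_import_map() above is a simple gather: collect one import map from
// every rank in the recovery set, and only then disambiguate and move to the
// rejoin state. A standalone sketch of that gather-then-act step; the sets
// and arrival order below are made up for illustration.
#include <iostream>
#include <set>

int main() {
  std::set<int> recovery_set;    // ranks we must hear from
  recovery_set.insert(1);
  recovery_set.insert(2);
  recovery_set.insert(3);

  std::set<int> got_import_map;  // ranks heard from so far

  int arrivals[] = {2, 1, 3};    // import maps can arrive in any order
  for (int from : arrivals) {
    got_import_map.insert(from);
    if (got_import_map == recovery_set)
      std::cout << "got all import maps, ready to rejoin" << std::endl;
    else
      std::cout << "still waiting, got " << got_import_map.size()
                << " of " << recovery_set.size() << std::endl;
  }
  return 0;
}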
+
+
+void MDCache::disambiguate_imports()
+{
+ dout(10) << "disambiguate_imports" << endl;
+
+ // other nodes' ambiguous imports
+ for (map<int, map<inodeno_t, set<inodeno_t> > >::iterator p = other_ambiguous_imports.begin();
+ p != other_ambiguous_imports.end();
+ ++p) {
+ int who = p->first;
+
+ for (map<inodeno_t, set<inodeno_t> >::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ CInode *diri = get_inode(q->first);
+ if (!diri) continue;
+ CDir *dir = diri->dir;
+ if (!dir) continue;
+
+ if (dir->authority() >= CDIR_AUTH_UNKNOWN) {
+ dout(10) << "mds" << who << " did not import " << *dir << endl;
+ } else {
+ dout(10) << "mds" << who << " did import " << *dir << endl;
+ int was = dir->authority();
+ dir->set_dir_auth(who);
+
+ for (set<inodeno_t>::iterator r = q->second.begin();
+ r != q->second.end();
+ ++r) {
+ CInode *exi = get_inode(*r);
+ if (!exi) continue;
+ CDir *ex = exi->dir;
+ if (!ex) continue;
+ if (ex->get_dir_auth() == CDIR_AUTH_PARENT)
+ ex->set_dir_auth(was);
+ dout(10) << " bound " << *ex << endl;
+ }
+ }
+ }
+ }
+ other_ambiguous_imports.clear();
+
+ // my ambiguous imports
+ while (!my_ambiguous_imports.empty()) {
+ map<inodeno_t, set<inodeno_t> >::iterator q = my_ambiguous_imports.begin();
+
+ CInode *diri = get_inode(q->first);
+ if (!diri || !diri->dir) {
+ // not in cache; drop the entry so the loop can make progress.
+ my_ambiguous_imports.erase(q);
+ continue;
+ }
+ CDir *dir = diri->dir;
+
+ if (dir->authority() != CDIR_AUTH_UNKNOWN) {
+ dout(10) << "ambiguous import auth known, must not be me " << *dir << endl;
+ cancel_ambiguous_import(q->first);
+ } else {
+ dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl;
+ finish_ambiguous_import(q->first);
+ }
+ }
+ assert(my_ambiguous_imports.empty());
+
+ show_imports();
+}
+
+void MDCache::cancel_ambiguous_import(inodeno_t dirino)
+{
+ assert(my_ambiguous_imports.count(dirino));
+ dout(10) << "cancel_ambiguous_import " << dirino
+ << " bounds " << my_ambiguous_imports[dirino]
+ << endl;
+ my_ambiguous_imports.erase(dirino);
+}
+
+void MDCache::finish_ambiguous_import(inodeno_t dirino)
+{
+ assert(my_ambiguous_imports.count(dirino));
+ set<inodeno_t> bounds;
+ bounds.swap(my_ambiguous_imports[dirino]);
+ my_ambiguous_imports.erase(dirino);
+
+ dout(10) << "finish_ambiguous_import " << dirino
+ << " bounds " << bounds
+ << endl;
+
+ CInode *diri = get_inode(dirino);
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ // adjust dir_auth
+ CDir *im = dir;
+ if (dir->get_inode()->authority() == mds->get_nodeid()) {
+ // parent is already me. adding to existing import.
+ im = get_auth_container(dir);
+ if (!im) im = dir;
+ nested_exports[im].erase(dir);
+ exports.erase(dir);
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ dir->state_clear(CDIR_STATE_EXPORT);
+ dir->put(CDir::PIN_EXPORT);
+ } else {
+ // parent isn't me. new import.
+ imports.insert(dir);
+ dir->set_dir_auth( mds->get_nodeid() );
+ dir->state_set(CDIR_STATE_IMPORT);
+ dir->get(CDir::PIN_IMPORT);
+ }
+
+ dout(10) << " base " << *dir << endl;
+ if (dir != im)
+ dout(10) << " under " << *im << endl;
+
+ // bounds (exports, before)
+ for (set<inodeno_t>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CInode *bi = get_inode(*p);
+ assert(bi);
+ CDir *bd = bi->dir;
+ assert(bd);
+
+ if (bd->get_dir_auth() == mds->get_nodeid()) {
+ // still me. was an import.
+ imports.erase(bd);
+ bd->set_dir_auth( CDIR_AUTH_PARENT );
+ bd->state_clear(CDIR_STATE_IMPORT);
+ bd->put(CDir::PIN_IMPORT);
+ // move nested exports.
+ for (set<CDir*>::iterator q = nested_exports[bd].begin();
+ q != nested_exports[bd].end();
+ ++q)
+ nested_exports[im].insert(*q);
+ nested_exports.erase(bd);
+
+ } else {
+ // not me anymore. now an export.
+ exports.insert(bd);
+ nested_exports[im].insert(bd);
+ assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
+ bd->set_dir_auth( CDIR_AUTH_UNKNOWN );
+ bd->state_set(CDIR_STATE_EXPORT);
+ bd->get(CDir::PIN_EXPORT);
+ }
+
+ dout(10) << " bound " << *bd << endl;
+ }
+}
+
+void MDCache::finish_ambiguous_export(inodeno_t dirino, set<inodeno_t>& bounds)
+{
+ CInode *diri = get_inode(dirino);
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ dout(10) << "finish_ambiguous_export " << dirino
+ << " bounds " << bounds
+ << endl;
+
+ // adjust dir_auth
+ CDir *im = get_auth_container(dir);
+ if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) {
+ // was an import, hose it
+ assert(im == dir);
+ assert(imports.count(dir));
+ imports.erase(dir);
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ dir->state_clear(CDIR_STATE_IMPORT);
+ dir->put(CDir::PIN_IMPORT);
+ } else {
+ // i'm now an export
+ exports.insert(dir);
+ nested_exports[im].insert(dir);
+ dir->set_dir_auth( CDIR_AUTH_UNKNOWN ); // not me
+ dir->state_set(CDIR_STATE_EXPORT);
+ dir->get(CDir::PIN_EXPORT);
+ }
+ dout(10) << " base " << *dir << endl;
+ if (dir != im)
+ dout(10) << " under " << *im << endl;
+
+ // bounds (there were exports, before)
+ for (set<inodeno_t>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CInode *bi = get_inode(*p);
+ assert(bi);
+ CDir *bd = bi->dir;
+ assert(bd);
+
+ // hose export
+ assert(exports.count(bd));
+ exports.erase(bd);
+ nested_exports[im].erase(bd);
+
+ // fix dir_auth
+ assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
+ bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me
+
+ bd->state_clear(CDIR_STATE_EXPORT);
+ bd->put(CDir::PIN_EXPORT);
+
+ dout(10) << " bound " << *bd << endl;
+ }
+
+ show_imports();
+}
+
+
+
+
+/*
+ * rejoin phase!
+ * we start out by sending rejoins to everyone in the recovery set.
+ *
+ * if _we_ are rejoining, send for all regions in our cache.
+ * if we are active|stopping, send only to nodes that are rejoining.
+ */
+void MDCache::send_cache_rejoins()
+{
+ dout(10) << "send_cache_rejoins " << endl;
+
+ map<int, MMDSCacheRejoin*> rejoins;
+
+ // if i am rejoining, send a rejoin to everyone.
+ // otherwise, just send to others who are rejoining.
+ for (set<int>::iterator p = recovery_set.begin();
+ p != recovery_set.end();
+ ++p) {
+ if (*p == mds->get_nodeid()) continue; // nothing to myself!
+ if (mds->is_rejoin() ||
+ mds->mdsmap->is_rejoin(*p))
+ rejoins[*p] = new MMDSCacheRejoin;
+ }
+
+ // build list of dir_auth regions
+ list<CDir*> dir_auth_regions;
+ for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+ p != inode_map.end();
+ ++p) {
+ if (!p->second->is_dir()) continue;
+ if (!p->second->dir) continue;
+ if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue;
+
+ int auth = p->second->dir->get_dir_auth();
+ assert(auth >= 0);
+
+ if (auth == mds->get_nodeid()) continue; // skip my own regions!
+
+ if (rejoins.count(auth) == 0)
+ continue; // don't care about this node's regions
+
+ // add to list
+ dout(10) << " on mds" << auth << " region " << *p->second << endl;
+ dir_auth_regions.push_back(p->second->dir);
+ }
+
+ // walk the regions
+ for (list<CDir*>::iterator p = dir_auth_regions.begin();
+ p != dir_auth_regions.end();
+ ++p) {
+ CDir *dir = *p;
+ int to = dir->authority();
+ cache_rejoin_walk(dir, rejoins[to]);
+ }
+
+ // send the messages
+ assert(rejoin_ack_gather.empty());
+ for (map<int,MMDSCacheRejoin*>::iterator p = rejoins.begin();
+ p != rejoins.end();
+ ++p) {
+ mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE);
+ rejoin_ack_gather.insert(p->first);
+ }
+
+ // nothing?
+ if (rejoins.empty()) {
+ dout(10) << "nothing to rejoin, going active" << endl;
+ mds->set_want_state(MDSMap::STATE_ACTIVE);
+ }
+}
+
+
+
+void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
+{
+ dout(10) << "cache_rejoin_walk " << *dir << endl;
+ rejoin->add_dir(dir->ino());
+
+ list<CDir*> nested; // finish this dir, then do nested items
+
+ // walk dentries
+ for (map<string,CDentry*>::iterator p = dir->items.begin();
+ p != dir->items.end();
+ ++p) {
+ // dentry
+ rejoin->add_dentry(dir->ino(), p->first);
+
+ // inode?
+ if (p->second->is_primary() && p->second->get_inode()) {
+ CInode *in = p->second->get_inode();
+ rejoin->add_inode(in->ino(),
+ in->get_caps_wanted());
+
+ // dir?
+ if (in->dir &&
+ in->dir->get_dir_auth() == CDIR_AUTH_PARENT)
+ nested.push_back(in->dir);
+ }
+ }
+
+ // recurse into nested dirs
+ for (list<CDir*>::iterator p = nested.begin();
+ p != nested.end();
+ ++p)
+ cache_rejoin_walk(*p, rejoin);
+}
+
+
+/*
+ * i got a rejoin.
+ *
+ * - reply with the lockstate
+ *
+ * if i am active|stopping,
+ * - remove source from replica list for everything not referenced here.
+ */
+void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
+{
+ dout(7) << "handle_cache_rejoin from " << m->get_source() << endl;
+ int from = m->get_source().num();
+
+ MMDSCacheRejoinAck *ack = new MMDSCacheRejoinAck;
+
+ if (mds->is_active() || mds->is_stopping()) {
+ dout(10) << "removing stale cache replicas" << endl;
+ // first, scour cache of replica references
+ for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+ p != inode_map.end();
+ ++p) {
+ // inode
+ CInode *in = p->second;
+ if (in->is_replica(from) && m->inodes.count(p->first) == 0) {
+ inode_remove_replica(in, from);
+ dout(10) << " rem " << *in << endl;
+ }
+
+ // dentry
+ if (in->parent) {
+ CDentry *dn = in->parent;
+ if (dn->is_replica(from) &&
+ (m->dentries.count(dn->get_dir()->ino()) == 0 ||
+ m->dentries[dn->get_dir()->ino()].count(dn->get_name()) == 0)) {
+ dn->remove_replica(from);
+ dout(10) << " rem " << *dn << endl;
+ }
+ }
+
+ // dir
+ if (in->dir) {
+ CDir *dir = in->dir;
+ if (dir->is_replica(from) && m->dirs.count(p->first) == 0) {
+ dir->remove_replica(from);
+ dout(10) << " rem " << *dir << endl;
+ }
+ }
+ }
+ } else {
+ assert(mds->is_rejoin());
+ }
+
+ // dirs
+ for (set<inodeno_t>::iterator p = m->dirs.begin();
+ p != m->dirs.end();
+ ++p) {
+ CInode *diri = get_inode(*p);
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+ int nonce = dir->add_replica(from);
+ dout(10) << " has " << *dir << endl;
+ ack->add_dir(*p, nonce);
+
+ // dentries
+ for (set<string>::iterator q = m->dentries[*p].begin();
+ q != m->dentries[*p].end();
+ ++q) {
+ CDentry *dn = dir->lookup(*q);
+ assert(dn);
+ int nonce = dn->add_replica(from);
+ dout(10) << " has " << *dn << endl;
+ ack->add_dentry(*p, *q, dn->get_lockstate(), nonce);
+ }
+ }
+
+ // inodes
+ for (map<inodeno_t,int>::iterator p = m->inodes.begin();
+ p != m->inodes.end();
+ ++p) {
+ CInode *in = get_inode(p->first);
+ assert(in);
+ int nonce = in->add_replica(from);
+ if (p->second)
+ in->mds_caps_wanted[from] = p->second;
+ else
+ in->mds_caps_wanted.erase(from);
+ in->hardlock.gather_set.erase(from); // just in case
+ in->filelock.gather_set.erase(from); // just in case
+ dout(10) << " has " << *in << endl;
+ ack->add_inode(p->first,
+ in->hardlock.get_replica_state(), in->filelock.get_replica_state(),
+ nonce);
+ }
+
+ // send ack
+ mds->send_message_mds(ack, from, MDS_PORT_CACHE);
+
+ delete m;
+}
+
+
+void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m)
+{
+ dout(7) << "handle_cache_rejoin from " << m->get_source() << endl;
+ int from = m->get_source().num();
+
+ // dirs
+ for (list<MMDSCacheRejoinAck::dirinfo>::iterator p = m->dirs.begin();
+ p != m->dirs.end();
+ ++p) {
+ CInode *diri = get_inode(p->dirino);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ dir->set_replica_nonce(p->nonce);
+ dout(10) << " got " << *dir << endl;
+
+ // dentries
+ for (map<string,MMDSCacheRejoinAck::dninfo>::iterator q = m->dentries[p->dirino].begin();
+ q != m->dentries[p->dirino].end();
+ ++q) {
+ CDentry *dn = dir->lookup(q->first);
+ assert(dn);
+ dn->set_replica_nonce(q->second.nonce);
+ dn->set_lockstate(q->second.lock);
+ dout(10) << " got " << *dn << endl;
+ }
+ }
+
+ // inodes
+ for (list<MMDSCacheRejoinAck::inodeinfo>::iterator p = m->inodes.begin();
+ p != m->inodes.end();
+ ++p) {
+ CInode *in = get_inode(p->ino);
+ assert(in);
+ in->set_replica_nonce(p->nonce);
+ in->hardlock.set_state(p->hardlock);
+ in->filelock.set_state(p->filelock);
+ dout(10) << " got " << *in << endl;
+ }
+
+ delete m;
+
+ // done?
+ rejoin_ack_gather.erase(from);
+ if (rejoin_ack_gather.empty()) {
+ dout(7) << "all done, going active!" << endl;
+ show_imports();
+ show_cache();
+ mds->set_want_state(MDSMap::STATE_ACTIVE);
+ } else {
+ dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl;
+ }
+
+}
+
+
+
+
+
+// ===============================================================================
void MDCache::rename_file(CDentry *srcdn,
CDentry *destdn)
{
assert(root == 0);
root = in;
- root->state_set(CINODE_STATE_ROOT);
+ root->state_set(CInode::STATE_ROOT);
}
void MDCache::add_import(CDir *dir)
{
imports.insert(dir);
dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDIR_PIN_IMPORT);
+ dir->get(CDir::PIN_IMPORT);
+}
+
+
+void MDCache::recalc_auth_bits()
+{
+ dout(7) << "recalc_auth_bits" << endl;
+
+ for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+ p != inode_map.end();
+ ++p) {
+ CInode *in = p->second;
+ if (in->authority() == mds->get_nodeid())
+ in->state_set(CInode::STATE_AUTH);
+ else {
+ in->state_clear(CInode::STATE_AUTH);
+ if (in->is_dirty())
+ in->mark_clean();
+ }
+
+ if (in->parent) {
+ if (in->parent->authority() == mds->get_nodeid())
+ in->parent->state_set(CDentry::STATE_AUTH);
+ else {
+ in->parent->state_clear(CDentry::STATE_AUTH);
+ if (in->parent->is_dirty())
+ in->parent->mark_clean();
+ }
+ }
+
+ if (in->dir) {
+ if (in->dir->authority() == mds->get_nodeid())
+ in->dir->state_set(CDIR_STATE_AUTH);
+ else {
+ in->dir->state_clear(CDIR_STATE_AUTH);
+ if (in->dir->is_dirty())
+ in->dir->mark_clean();
+ }
+ }
+ }
+ show_imports();
+ show_cache();
}
-
bool MDCache::trim(int max)
{
- // empty? short cut.
- if (lru.lru_get_size() == 0) return true;
-
+ // trim LRU
if (max < 0) {
max = lru.lru_get_max();
if (!max) return false;
}
+ dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl;
map<int, MCacheExpire*> expiremap;
- dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl;
- assert(expiremap.empty());
-
while (lru.lru_get_size() > (unsigned)max) {
- CInode *in = (CInode*)lru.lru_expire();
- if (!in) break; //return false;
+ CDentry *dn = (CDentry*)lru.lru_expire();
+ if (!dn) break;
+
+ CDir *dir = dn->get_dir();
+ assert(dir);
+
+ // notify dentry authority?
+ if (!dn->is_auth()) {
+ int auth = dn->authority();
+ dout(17) << "sending expire to mds" << auth << " on " << *dn << endl;
+ if (expiremap.count(auth) == 0)
+ expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+ expiremap[auth]->add_dentry(dir->ino(), dn->get_name(), dn->get_replica_nonce());
+ }
+
+ // unlink the dentry
+ dout(15) << "trim removing " << *dn << endl;
+ if (!dn->is_null())
+ dir->unlink_inode(dn);
+ dir->remove_dentry(dn);
+
+ // adjust the dir state
+ CInode *diri = dir->get_inode();
+ diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete!
+
+ // reexport?
+ if (diri->dir->is_import() && // import
+ diri->dir->get_size() == 0 && // no children
+ !diri->is_root()) // not root
+ migrator->export_empty_import(diri->dir);
+
+ if (mds->logger) mds->logger->inc("cex");
+ }
+
+ // inode expire_queue
+ while (!inode_expire_queue.empty()) {
+ CInode *in = inode_expire_queue.front();
+ inode_expire_queue.pop_front();
+ assert(in->get_num_ref() == 0);
+
+ int dirauth = -2;
if (in->dir) {
// notify dir authority?
- int auth = in->dir->authority();
- if (auth != mds->get_nodeid()) {
- dout(17) << "sending expire to mds" << auth << " on " << *in->dir << endl;
- if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce);
+ dirauth = in->dir->authority();
+ if (dirauth != mds->get_nodeid()) {
+ dout(17) << "sending expire to mds" << dirauth << " on " << *in->dir << endl;
+ if (expiremap.count(dirauth) == 0)
+ expiremap[dirauth] = new MCacheExpire(mds->get_nodeid());
+ expiremap[dirauth]->add_dir(in->ino(), in->dir->replica_nonce);
}
- }
- // notify inode authority?
- {
- int auth = in->authority();
- if (auth != mds->get_nodeid()) {
- assert(!in->is_auth());
- dout(17) << "sending expire to mds" << auth << " on " << *in << endl;
- if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
- expiremap[auth]->add_inode(in->ino(), in->replica_nonce);
- } else {
- assert(in->is_auth());
- }
+ in->close_dir();
}
- CInode *diri = NULL;
- if (in->parent)
- diri = in->parent->dir->inode;
-
- if (in->is_root()) {
- dout(7) << "just trimmed root, cache now empty." << endl;
- root = NULL;
+
+ // notify inode authority
+ int auth = in->authority();
+ if (auth == CDIR_AUTH_UNKNOWN) {
+ assert(in->ino() == 1);
+ assert(dirauth >= 0);
+ auth = dirauth;
}
-
-
- // last link?
- if (in->inode.nlink == 0) {
- dout(17) << "last link, removing file content " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS!
- mds->filer->zero(in->inode,
- 0, in->inode.size,
- NULL, NULL); // FIXME
+ if (auth != mds->get_nodeid()) {
+ assert(!in->is_auth());
+ dout(17) << "sending expire to mds" << auth << " on " << *in << endl;
+ if (expiremap.count(auth) == 0)
+ expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+ expiremap[auth]->add_inode(in->ino(), in->get_replica_nonce());
+ } else {
+ assert(in->is_auth());
}
- // remove it
- dout(15) << "trim removing " << *in << " " << in << endl;
+ dout(15) << "trim removing " << *in << endl;
+ if (in == root) root = 0;
remove_inode(in);
- delete in;
-
- if (diri) {
- // dir incomplete!
- diri->dir->state_clear(CDIR_STATE_COMPLETE);
-
- // reexport?
- if (diri->dir->is_import() && // import
- diri->dir->get_size() == 0 && // no children
- !diri->is_root()) // not root
- migrator->export_empty_import(diri->dir);
-
- }
-
- if (mds->logger) mds->logger->inc("cex");
}
-
- /* hack
- if (lru.lru_get_size() == max) {
- int i;
- dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl;
- CInode *cur = (CInode*)lru.lru_tophead;
- i = 1;
- while (cur) {
- dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << " " << *cur << endl;
- cur = (CInode*)cur->lru_next;
- }
-
- dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl;
- cur = (CInode*)lru.lru_bothead;
- i = 1;
- while (cur) {
- dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << " " << *cur << endl;
- cur = (CInode*)cur->lru_next;
- }
-
- }
- */
-
// send expires
for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
it != expiremap.end();
return true;
}
+
+void MDCache::trim_non_auth()
+{
+ dout(7) << "trim_non_auth" << endl;
+
+ CDentry *first_auth = 0;
+
+ // trim non-auth items from the lru
+ while (lru.lru_get_size() > 0) {
+ CDentry *dn = (CDentry*)lru.lru_expire();
+ if (!dn) break;
+
+ if (dn->is_auth()) {
+ // add back into lru (at the top)
+ lru.lru_insert_top(dn);
+
+ if (!first_auth) {
+ first_auth = dn;
+ } else {
+ if (first_auth == dn)
+ break;
+ }
+ } else {
+ // non-auth. expire.
+ CDir *dir = dn->get_dir();
+ assert(dir);
+
+ // unlink the dentry
+ dout(15) << "trim_non_auth removing " << *dn << endl;
+ if (!dn->is_null())
+ dir->unlink_inode(dn);
+ dir->remove_dentry(dn);
+
+ // adjust the dir state
+ CInode *diri = dir->get_inode();
+ diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete!
+ }
+ }
+
+ // inode expire queue
+ while (!inode_expire_queue.empty()) {
+ CInode *in = inode_expire_queue.front();
+ inode_expire_queue.pop_front();
+ dout(15) << "trim_non_auth removing " << *in << endl;
+ if (in == root) root = 0;
+ remove_inode(in);
+ }
+}
+
+
+
class C_MDC_ShutdownCommit : public Context {
MDCache *mdc;
public:
class C_MDC_ShutdownCheck : public Context {
MDCache *mdc;
- Mutex *lock;
public:
- C_MDC_ShutdownCheck(MDCache *m, Mutex *l) : mdc(m), lock(l) {}
+ C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {}
void finish(int) {
- lock->Lock();
mdc->shutdown_check();
- lock->Unlock();
}
};
g_conf.debug_mds = 10;
show_cache();
g_conf.debug_mds = o;
- g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock));
+ mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this));
// this
dout(0) << "lru size now " << lru.lru_get_size() << endl;
dout(1) << "shutdown_start" << endl;
if (g_conf.mds_shutdown_check)
- g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock));
+ mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this));
}
{
dout(7) << "shutdown_pass" << endl;
//assert(mds->is_shutting_down());
- if (mds->is_stopped()) {
+ if (mds->is_out()) {
dout(7) << " already shut down" << endl;
show_cache();
show_imports();
// flush anything we can from the cache
trim(0);
- dout(5) << "cache size now " << lru.lru_get_size() << endl;
+ dout(5) << "lru size now " << lru.lru_get_size() << endl;
+ mds->mdlog->trim(0);
// (wait for) flush log?
- if (g_conf.mds_log_flush_on_shutdown &&
- mds->mdlog->get_num_events()) {
- dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl;
- return false;
- }
-
+ if (g_conf.mds_log_flush_on_shutdown) {
+ if (mds->mdlog->get_non_importmap_events()) {
+ dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events()
+ << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl;
+ return false;
+ }
+ }
+
+
// send all imports back to 0.
if (mds->get_nodeid() != 0 && !did_shutdown_exports) {
// flush what i can from the cache first..
return false;
}
- // filer active?
- if (mds->filer->is_active()) {
- dout(7) << "filer still active" << endl;
- return false;
- }
// close root?
if (mds->get_nodeid() == 0 &&
- lru.lru_get_size() == 1 &&
+ lru.lru_get_size() == 0 &&
root &&
root->dir &&
root->dir->is_import() &&
- root->dir->get_ref() == 1) { // 1 is the import!
+ root->dir->get_num_ref() == 1) { // 1 is the import!
// un-import
dout(7) << "removing root import" << endl;
imports.erase(root->dir);
root->dir->state_clear(CDIR_STATE_IMPORT);
- root->dir->put(CDIR_PIN_IMPORT);
+ root->dir->put(CDir::PIN_IMPORT);
- if (root->is_pinned_by(CINODE_PIN_DIRTY)) {
- dout(7) << "clearing root dirty flag" << endl;
- root->put(CINODE_PIN_DIRTY);
+ if (root->is_pinned_by(CInode::PIN_DIRTY)) {
+ dout(7) << "clearing root inode dirty flag" << endl;
+ root->put(CInode::PIN_DIRTY);
}
trim(0);
- assert(inode_map.size() == lru.lru_get_size());
}
// imports?
- if (!imports.empty()) {
- dout(7) << "still have " << imports.size() << " imports" << endl;
+ if (!imports.empty() || migrator->is_exporting()) {
+ dout(7) << "still have " << imports.size() << " imports, or still exporting" << endl;
show_cache();
return false;
}
+ // cap log?
+ if (g_conf.mds_log_flush_on_shutdown) {
+
+ if (imports.empty() && exports.empty()) {
+ // (only do this once!)
+ if (!mds->mdlog->is_capped()) {
+ dout(7) << "capping the log" << endl;
+ mds->mdlog->cap();
+ // note that this won't flush right away, so we'll make at least one more pass
+ }
+ }
+
+ if (mds->mdlog->get_num_events()) {
+ dout(7) << "waiting for log to flush (including import_map, now) .. " << mds->mdlog->get_num_events()
+ << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl;
+ return false;
+ }
+
+ if (!did_shutdown_log_cap) {
+ // flush journal header
+ dout(7) << "writing header for (now-empty) journal" << endl;
+ assert(mds->mdlog->empty());
+ mds->mdlog->write_head(0);
+ // NOTE: filer active checker below will block us until this completes.
+ did_shutdown_log_cap = true;
+ return false;
+ }
+ }
+
+ // filer active?
+ if (mds->filer->is_active()) {
+ dout(7) << "filer still active" << endl;
+ return false;
+ }
+
+
// done?
if (lru.lru_get_size() > 0) {
dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl;
+CInode *MDCache::create_root_inode()
+{
+ CInode *root = new CInode(this);
+ memset(&root->inode, 0, sizeof(inode_t));
+ root->inode.ino = 1;
+ root->inode.hash_seed = 0; // not hashed!
+
+ // make it up (FIXME)
+ root->inode.mode = 0755 | INODE_MODE_DIR;
+ root->inode.size = 0;
+ root->inode.ctime = 0;
+ root->inode.mtime = g_clock.gettime();
+
+ root->inode.nlink = 1;
+ root->inode.layout = g_OSD_MDDirLayout;
+
+ set_root( root );
+ add_inode( root );
+
+ return root;
+}
int MDCache::open_root(Context *c)
// open root inode
if (whoami == 0) {
// i am root inode
- CInode *root = new CInode(this);
- memset(&root->inode, 0, sizeof(inode_t));
- root->inode.ino = 1;
- root->inode.hash_seed = 0; // not hashed!
-
- // make it up (FIXME)
- root->inode.mode = 0755 | INODE_MODE_DIR;
- root->inode.size = 0;
- root->inode.ctime = 0;
- root->inode.mtime = g_clock.gettime();
-
- root->inode.nlink = 1;
- root->inode.layout = g_OSD_MDDirLayout;
-
- set_root( root );
- add_inode( root );
+ CInode *root = create_root_inode();
// root directory too
assert(root->dir == NULL);
- root->set_dir( new CDir(root, mds, true) );
+ root->set_dir( new CDir(root, this, true) );
root->dir->set_dir_auth( 0 ); // me!
root->dir->dir_rep = CDIR_REP_ALL; //NONE;
// root is sort of technically an import (from a vacuum)
imports.insert( root->dir );
root->dir->state_set(CDIR_STATE_IMPORT);
- root->dir->get(CDIR_PIN_IMPORT);
+ root->dir->get(CDir::PIN_IMPORT);
if (c) {
c->finish(0);
void MDCache::dispatch(Message *m)
{
switch (m->get_type()) {
+
+ case MSG_MDS_IMPORTMAP:
+ handle_import_map((MMDSImportMap*)m);
+ break;
+
+ case MSG_MDS_CACHEREJOIN:
+ handle_cache_rejoin((MMDSCacheRejoin*)m);
+ break;
+ case MSG_MDS_CACHEREJOINACK:
+ handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m);
+ break;
+
+
case MSG_MDS_DISCOVER:
handle_discover((MDiscover*)m);
break;
return 1;
}
- cur->get_or_open_dir(mds);
+ cur->get_or_open_dir(this);
assert(cur->dir);
} else {
// discover dir from/via inode auth
if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() &&
cur->dir->is_auth() &&
cur->dir->is_rep() &&
- cur->dir->is_open_by(req->get_source().num()) &&
+ cur->dir->is_replica(req->get_source().num()) &&
dn->get_inode()->is_auth()
) {
assert(req->get_source().is_mds());
int from = req->get_source().num();
- if (dn->get_inode()->is_cached_by(from)) {
+ if (dn->get_inode()->is_replica(from)) {
dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by "
<< req->get_source() << " dn " << *dn << endl;
} else {
dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl;
MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino());
- reply->add_dentry( dn->get_name(), !dn->can_read());
+ reply->add_dentry( dn->replicate_to( from ) );
reply->add_inode( dn->inode->replicate_to( from ) );
mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE);
}
dout(11) << "path_unpinned " << *dn << endl;
// did we completely unpin a waiter?
- if (dn->lockstate == DN_LOCK_UNPINNING && !dn->is_pinned()) {
+ if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) {
// return state to sync, in case the unpinner flails
dn->lockstate = DN_LOCK_SYNC;
if (g_conf.log_pins) {
// pin
- for (int i=0; i<CINODE_NUM_PINS; i++) {
+ /*
+for (int i=0; i<CINODE_NUM_PINS; i++) {
if (mds->logger2) mds->logger2->set(cinode_pin_names[i],
cinode_pins[i]);
}
+ */
/*
for (map<int,int>::iterator it = cdir_pins.begin();
it != cdir_pins.end();
assert(in->inode.anchored == false);
in->inode.anchored = true;
- in->state_clear(CINODE_STATE_ANCHORING);
- in->put(CINODE_PIN_ANCHORING);
+ in->state_clear(CInode::STATE_ANCHORING);
+ in->put(CInode::PIN_ANCHORING);
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
}
// trigger
assert(in->is_auth());
// already anchoring?
- if (in->state_test(CINODE_STATE_ANCHORING)) {
+ if (in->state_test(CInode::STATE_ANCHORING)) {
dout(7) << "anchor_inode already anchoring " << *in << endl;
// wait
dout(7) << "anchor_inode anchoring " << *in << endl;
// auth: do it
- in->state_set(CINODE_STATE_ANCHORING);
- in->get(CINODE_PIN_ANCHORING);
+ in->state_set(CInode::STATE_ANCHORING);
+ in->get(CInode::PIN_ANCHORING);
// wait
in->add_waiter(CINODE_WAIT_ANCHORED,
}
in->inode.nlink++;
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
// reply
dout(7) << " nlink++, now " << in->inode.nlink++ << endl;
}
if (!cur->dir)
- cur->get_or_open_dir(mds);
+ cur->get_or_open_dir(this);
assert(cur->dir);
dout(10) << "dir is " << *cur->dir << endl;
break;
}
- if (!cur->dir) cur->get_or_open_dir(mds);
+ if (!cur->dir) cur->get_or_open_dir(this);
reply->add_dir( new CDirDiscover( cur->dir,
- cur->dir->open_by_add( dis->get_asker() ) ) );
+ cur->dir->add_replica( dis->get_asker() ) ) );
dout(7) << "added dir " << *cur->dir << endl;
}
if (dis->get_want().depth() == 0) break;
break; // don't replicate null but non-locked dentries.
}
- reply->add_dentry( dis->get_dentry(i), !dn->can_read() );
+ reply->add_dentry( dn->replicate_to( dis->get_asker() ) );
dout(7) << "added dentry " << *dn << endl;
if (!dn->inode) break; // we're done.
dout2(7) << ", now " << *cur->dir << endl;
} else {
// add it (_replica_)
- cur->set_dir( new CDir(cur, mds, false) );
+ cur->set_dir( new CDir(cur, this, false) );
m->get_dir(i).update_dir(cur->dir);
dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl;
if (i >= m->get_num_dentries()) break;
// dentry
- dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl;
+ dout(7) << "i = " << i << " dentry is " << m->get_dentry(i).get_dname() << endl;
CDentry *dn = 0;
if (i > 0 ||
m->has_base_dentry()) {
- dn = cur->dir->lookup( m->get_dentry(i) );
+ dn = cur->dir->lookup( m->get_dentry(i).get_dname() );
if (dn) {
dout(7) << "had " << *dn << endl;
+ dn->replica_nonce = m->get_dentry(i).get_nonce(); // fix nonce.
} else {
- dn = cur->dir->add_dentry( m->get_dentry(i) );
- if (m->get_dentry_xlock(i)) {
- dout(7) << " new dentry is xlock " << *dn << endl;
- dn->lockstate = DN_LOCK_XLOCK;
- dn->xlockedby = 0;
- }
+ dn = cur->dir->add_dentry( m->get_dentry(i).get_dname(), 0, false );
+ m->get_dentry(i).update_dentry(dn);
dout(7) << "added " << *dn << endl;
}
cur->dir->take_waiting(CDIR_WAIT_DENTRY,
- m->get_dentry(i),
+ m->get_dentry(i).get_dname(),
finished);
}
map<int, MCacheExpire*> proxymap;
if (m->get_from() == source) {
- dout(7) << "cache_expire from " << from << endl;
+ dout(7) << "cache_expire from mds" << from << endl;
} else {
- dout(7) << "cache_expire from " << from << " via " << source << endl;
+ dout(7) << "cache_expire from mds" << from << " via " << source << endl;
}
// inodes
int nonce = it->second;
if (!in) {
- dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl;
+ dout(0) << "inode expire on " << it->first << " from " << from << ", don't have it" << endl;
assert(in); // i should be authority, or proxy .. and pinned
}
if (!in->is_auth()) {
int newauth = in->authority();
dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl;
assert(newauth >= 0);
- if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
- assert(in->state_test(CINODE_STATE_PROXY));
+ if (!in->state_test(CInode::STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
+ assert(in->state_test(CInode::STATE_PROXY));
if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
proxymap[newauth]->add_inode(it->first, it->second);
continue;
if (from == mds->get_nodeid()) {
// my cache_expire, and the export_dir giving auth back to me crossed paths!
// we can ignore this. no danger of confusion since the two parties are both me.
- dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl;
+ dout(7) << "inode expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl;
}
- else if (nonce == in->get_cached_by_nonce(from)) {
+ else if (nonce == in->get_replica_nonce(from)) {
// remove from our cached_by
- dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl;
- in->cached_by_remove(from);
- in->mds_caps_wanted.erase(from);
-
- // note: this code calls _eval more often than it needs to!
- // fix lock
- if (in->hardlock.is_gathering(from)) {
- in->hardlock.gather_set.erase(from);
- if (in->hardlock.gather_set.size() == 0)
- mds->locker->inode_hard_eval(in);
- }
- if (in->filelock.is_gathering(from)) {
- in->filelock.gather_set.erase(from);
- if (in->filelock.gather_set.size() == 0)
- mds->locker->inode_file_eval(in);
- }
-
- // alone now?
- if (!in->is_cached_by_anyone()) {
- mds->locker->inode_hard_eval(in);
- mds->locker->inode_file_eval(in);
- }
+ dout(7) << "inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl;
+ inode_remove_replica(in, from);
}
else {
// this is an old nonce, ignore expire.
- dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl;
- assert(in->get_cached_by_nonce(from) > nonce);
+ dout(7) << "inode expire on " << *in << " from mds" << from
+ << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping"
+ << endl;
+ assert(in->get_replica_nonce(from) > nonce);
}
}
it != m->get_dirs().end();
it++) {
CInode *diri = get_inode(it->first);
+ assert(diri);
CDir *dir = diri->dir;
int nonce = it->second;
if (!dir) {
- dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
+ dout(0) << "dir expire on " << it->first << " from " << from << ", don't have it" << endl;
assert(dir); // i should be authority, or proxy ... and pinned
}
if (!dir->is_auth()) {
// check nonce
if (from == mds->get_nodeid()) {
- dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME! ignoring" << endl;
+ dout(7) << "dir expire on " << *dir << " from mds" << from
+ << " .. ME! ignoring" << endl;
}
- else if (nonce == dir->get_open_by_nonce(from)) {
+ else if (nonce == dir->get_replica_nonce(from)) {
// remove from our cached_by
- dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl;
- dir->open_by_remove(from);
+ dout(7) << "dir expire on " << *dir << " from mds" << from
+ << " replicas was " << dir->replicas << endl;
+ dir->remove_replica(from);
}
else {
// this is an old nonce, ignore expire.
- dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl;
- assert(dir->get_open_by_nonce(from) > nonce);
+ dout(7) << "dir expire on " << *dir << " from mds" << from
+ << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
+ << "), dropping" << endl;
+ assert(dir->get_replica_nonce(from) > nonce);
+ }
+ }
+
+ // dentries
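+  //  (as with inodes above: if we are no longer auth for the dir, proxy the
+  //   expire on to the real auth via proxymap; otherwise drop the named
+  //   replicas, checking nonces.)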
+ for (map<inodeno_t, map<string,int> >::iterator pd = m->get_dentries().begin();
+ pd != m->get_dentries().end();
+ ++pd) {
+ dout(0) << "dn expires in dir " << pd->first << endl;
+ CInode *diri = get_inode(pd->first);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ if (!dir->is_auth()) {
+ int newauth = dir->authority();
+ dout(7) << "proxy dentry expires on " << *dir << " to " << newauth << endl;
+ if (!dir->is_proxy())
+ dout(0) << "nonproxy dentry expires? " << *dir << " .. auth is " << newauth
+ << " .. expire is from " << from << endl;
+ assert(dir->is_proxy());
+ assert(newauth >= 0);
+ assert(dir->state_test(CDIR_STATE_PROXY));
+ if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+ proxymap[newauth]->add_dentries(pd->first, pd->second);
+ continue;
+ }
+
+ for (map<string,int>::iterator p = pd->second.begin();
+ p != pd->second.end();
+ ++p) {
+ int nonce = p->second;
+
+ CDentry *dn = dir->lookup(p->first);
+ if (!dn)
+ dout(0) << "missing dentry for " << p->first << " in " << *dir << endl;
+ assert(dn);
+
+ if (from == mds->get_nodeid()) {
+ dout(7) << "dentry_expire on " << *dn << " from mds" << from
+ << " .. ME! ignoring" << endl;
+ }
+ else if (nonce == dn->get_replica_nonce(from)) {
+ dout(7) << "dentry_expire on " << *dn << " from mds" << from << endl;
+ dn->remove_replica(from);
+ }
+ else {
+ dout(7) << "dentry_expire on " << *dn << " from mds" << from
+ << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
+ << "), dropping" << endl;
+ assert(dn->get_replica_nonce(from) > nonce);
+ }
}
}
delete m;
}
+void MDCache::inode_remove_replica(CInode *in, int from)
+{
+ in->remove_replica(from);
+ in->mds_caps_wanted.erase(from);
+
+ // note: this code calls _eval more often than it needs to!
+ // fix lock
+ if (in->hardlock.is_gathering(from)) {
+ in->hardlock.gather_set.erase(from);
+ if (in->hardlock.gather_set.size() == 0)
+ mds->locker->inode_hard_eval(in);
+ }
+ if (in->filelock.is_gathering(from)) {
+ in->filelock.gather_set.erase(from);
+ if (in->filelock.gather_set.size() == 0)
+ mds->locker->inode_file_eval(in);
+ }
+
+ // alone now?
+ if (!in->is_replicated()) {
+ mds->locker->inode_hard_eval(in);
+ mds->locker->inode_file_eval(in);
+ }
+}
int MDCache::send_dir_updates(CDir *dir, bool bcast)
{
// this is an FYI, re: replication
- set<int> who = dir->open_by;
- if (bcast)
- who = mds->get_mds_map()->get_mds();
+ set<int> who;
+ if (bcast) {
+ mds->get_mds_map()->get_active_mds_set(who);
+ } else {
+ for (map<int,int>::iterator p = dir->replicas_begin();
+ p != dir->replicas_end();
+ ++p)
+ who.insert(p->first);
+ }
dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl;
// log it
if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME
- mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode),
+ mds->mdlog->submit_entry(new EString("unlink fixme fixme"),//EUnlink(dir, dn, dn->inode),
NULL); // FIXME FIXME FIXME
// tell replicas
- if (dir->is_open_by_anyone()) {
- for (set<int>::iterator it = dir->open_by_begin();
- it != dir->open_by_end();
+ if (dir->is_replicated()) {
+ for (map<int,int>::iterator it = dir->replicas_begin();
+ it != dir->replicas_end();
it++) {
- dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl;
+ dout(7) << "inode_unlink sending DentryUnlink to mds" << it->first << endl;
- mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE);
+ mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), it->first, MDS_PORT_CACHE);
}
// don't need ack.
// unlink locally
CInode *in = dn->inode;
dn->dir->unlink_inode( dn );
- dn->mark_dirty();
+ dn->_mark_dirty(); // fixme
// mark it dirty!
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
// update anchor to point to inode file+mds
vector<Anchor*> atrace;
// awesome, i can do it
dout(7) << "remote target is local, nlink--" << endl;
dn->inode->inode.nlink--;
- dn->inode->mark_dirty();
+ dn->inode->_mark_dirty(); // fixme
- if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) ||
- (!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) {
+ if (( dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 0) ||
+ (!dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 1)) {
dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl;
// remove anchor (async)
// unlink locally
CInode *in = dn->inode;
dn->dir->unlink_inode( dn );
- dn->mark_dirty();
+ dn->_mark_dirty(); // fixme
// add waiter
in->add_waiter(CINODE_WAIT_UNLINK, c);
// unlink locally
dn->dir->unlink_inode( dn );
- dn->mark_dirty();
+ dn->_mark_dirty(); // fixme
// finish!
dentry_unlink_finish(dn, dir, c);
assert(in->inode.nlink > 0);
in->inode.nlink--;
- if (in->state_test(CINODE_STATE_DANGLING)) {
+ if (in->state_test(CInode::STATE_DANGLING)) {
// already dangling.
// last link?
if (in->inode.nlink == 0) {
mds->anchorclient->destroy(in->ino(), NULL);
}
else {
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
}
} else {
// has primary link still.
assert(in->inode.nlink >= 1);
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
if (in->inode.nlink == 1) {
dout(7) << "nlink=1, removing anchor" << endl;
* Returns the directory in which authority is delegated for *dir.
* This may be because a directory is an import, or because it is hashed
* and we are nested underneath an inode in that dir (that hashes to us).
- * Thus do not assume con->is_auth()! It is_auth() || is_hashed().
+ * Thus do not assume result->is_auth()!  It satisfies is_auth() || is_hashed().
*/
CDir *MDCache::get_auth_container(CDir *dir)
{
while (true) {
if (imp->is_import()) break; // import
imp = imp->get_parent_dir();
- assert(imp);
+ if (!imp) break; // none
if (imp->is_hashed()) break; // hash
}
return imp;
}
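+
+// get_export_container: counterpart to get_auth_container() above -- walks up
+// from *dir to the export (or hashed dir) that delegates this subtree away
+// from us.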
+CDir *MDCache::get_export_container(CDir *dir)
+{
+ CDir *ex = dir; // might be *dir
+ assert(!ex->is_auth());
+
+ // find the underlying import or hash that delegates dir away
+ while (true) {
+    if (ex->is_export()) break;  // export
+ ex = ex->get_parent_dir();
+ assert(ex);
+ if (ex->is_hashed()) break; // hash
+ }
+
+ return ex;
+}
+
void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s)
{
void MDCache::show_imports()
{
- mds->balancer->show_imports();
+ int db = 10;
+
+ if (imports.empty() &&
+ hashdirs.empty()) {
+ dout(db) << "show_imports: no imports/exports/hashdirs" << endl;
+ return;
+ }
+ dout(db) << "show_imports:" << endl;
+
+ set<CDir*> ecopy = exports;
+
+ set<CDir*>::iterator it = hashdirs.begin();
+ while (1) {
+ if (it == hashdirs.end()) it = imports.begin();
+ if (it == imports.end() ) break;
+
+ CDir *im = *it;
+
+ if (im->is_import()) {
+ dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
+ //assert( im->is_auth() );
+ }
+ else if (im->is_hashed()) {
+ if (im->is_import()) continue; // if import AND hash, list as import.
+ dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
+ }
+
+ for (set<CDir*>::iterator p = nested_exports[im].begin();
+ p != nested_exports[im].end();
+ p++) {
+ CDir *exp = *p;
+ if (exp->is_hashed()) {
+ //assert(0); // we don't do it this way actually
+ dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
+ //assert( !exp->is_auth() );
+ } else {
+ dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
+ assert( exp->is_export() );
+ //assert( !exp->is_auth() );
+ }
+
+ if ( get_auth_container(exp) != im ) {
+ dout(1) << "uh oh, auth container is " << *get_auth_container(exp) << endl;
+ assert( get_auth_container(exp) == im );
+ }
+
+ if (ecopy.count(exp) != 1) {
+ dout(1) << "***** nested_export " << *exp << " not in exports" << endl;
+ assert(0);
+ }
+ ecopy.erase(exp);
+ }
+
+ it++;
+ }
+
+ if (ecopy.size()) {
+ for (set<CDir*>::iterator it = ecopy.begin();
+ it != ecopy.end();
+ it++)
+ dout(1) << "***** stray item in exports: " << **it << endl;
+ assert(ecopy.size() == 0);
+ }
}
void MDCache::show_cache()
{
dout(7) << "show_cache" << endl;
+
for (hash_map<inodeno_t,CInode*>::iterator it = inode_map.begin();
it != inode_map.end();
it++) {
class Message;
+class MMDSImportMap;
+class MMDSCacheRejoin;
+class MMDSCacheRejoinAck;
class MDiscover;
class MDiscoverReply;
class MCacheExpire;
}
class MDCache {
- protected:
+ public:
// my master
MDS *mds;
+ LRU lru; // dentry lru for expiring items from cache
+
+ protected:
// the cache
CInode *root; // root inode
- LRU lru; // lru for expiring items
hash_map<inodeno_t,CInode*> inode_map; // map of inodes by ino
-
+
+ list<CInode*> inode_expire_queue; // inodes to delete
+
+
// root
list<Context*> waiting_for_root;
set<CDir*> hashdirs;
map<CDir*,set<CDir*> > nested_exports; // exports nested under imports _or_ hashdirs
+ void adjust_export(int to, CDir *root, set<CDir*>& bounds);
+ void adjust_import(int from, CDir *root, set<CDir*>& bounds);
+
+
+
// active MDS requests
hash_map<Message*, active_request_t> active_requests;
// shutdown crap
int shutdown_commits;
bool did_shutdown_exports;
+ bool did_shutdown_log_cap;
friend class C_MDC_ShutdownCommit;
+ // recovery
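+  //  resolve: peers exchange import maps (handle_import_map) so that imports
+  //  left ambiguous by a replayed EImportStart without an EImportFinish can
+  //  be disambiguated; rejoin: peers exchange MMDSCacheRejoin/Ack to rebuild
+  //  replica state in the distributed cache.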
+protected:
+ // from EImportStart w/o EImportFinish during journal replay
+ map<inodeno_t, set<inodeno_t> > my_ambiguous_imports;
+ // from MMDSImportMaps
+ map<int, map<inodeno_t, set<inodeno_t> > > other_ambiguous_imports;
+
+ set<int> recovery_set;
+ set<int> wants_import_map; // nodes i need to send my import map to
+  set<int> got_import_map;     // nodes i got import maps from
+ set<int> rejoin_ack_gather; // nodes i need a rejoin ack from
+
+ void handle_import_map(MMDSImportMap *m);
+ void handle_cache_rejoin(MMDSCacheRejoin *m);
+ void handle_cache_rejoin_ack(MMDSCacheRejoinAck *m);
+ void disambiguate_imports();
+ void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin);
+ void send_cache_rejoin_acks();
+public:
+ void send_import_map(int who);
+ void send_import_map_now(int who);
+ void send_import_map_later(int who) {
+ wants_import_map.insert(who);
+ }
+ void send_pending_import_maps(); // maybe.
+ void send_cache_rejoins();
+
+ void set_recovery_set(set<int>& s) {
+ recovery_set = s;
+ }
+
+ // ambiguous imports
+ void add_ambiguous_import(inodeno_t base, set<inodeno_t>& bounds) {
+ my_ambiguous_imports[base].swap(bounds);
+ }
+ void cancel_ambiguous_import(inodeno_t dirino);
+ void finish_ambiguous_import(inodeno_t dirino);
+
+ void finish_ambiguous_export(inodeno_t dirino, set<inodeno_t>& bounds);
+
+
+
+
+
friend class CInode;
friend class Locker;
friend class Migrator;
friend class Renamer;
friend class MDBalancer;
+ friend class EImportMap;
+
public:
+
// subsystems
Migrator *migrator;
Renamer *renamer;
CInode *get_root() { return root; }
void set_root(CInode *r);
+ int get_num_imports() { return imports.size(); }
void add_import(CDir *dir);
void remove_import(CDir *dir);
+ void recalc_auth_bits();
+ void log_import_map(Context *onsync=0);
+
+
// cache
void set_cache_size(size_t max) { lru.lru_set_max(max); }
size_t get_cache_size() { return lru.lru_get_size(); }
bool trim(int max = -1); // trim cache
+ void trim_non_auth(); // trim out trimmable non-auth items
// shutdown
void shutdown_start();
return NULL;
}
+
+ int hash_dentry(inodeno_t ino, const string& s) {
+ return 0; // fixme
+ }
+
+
public:
CInode *create_inode();
void add_inode(CInode *in);
void remove_inode(CInode *in);
void destroy_inode(CInode *in);
void touch_inode(CInode *in) {
- // touch parent(s) too
- if (in->get_parent_dir()) touch_inode(in->get_parent_dir()->inode);
+ if (in->get_parent_dn())
+ touch_dentry(in->get_parent_dn());
+ }
+ void touch_dentry(CDentry *dn) {
+ // touch ancestors
+ if (dn->get_dir()->get_inode()->get_parent_dn())
+ touch_dentry(dn->get_dir()->get_inode()->get_parent_dn());
- // top or mid, depending on whether i'm auth
- if (in->is_auth())
- lru.lru_touch(in);
+ // touch me
+ if (dn->is_auth())
+ lru.lru_touch(dn);
else
- lru.lru_midtouch(in);
+ lru.lru_midtouch(dn);
}
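+  // note: the lru is dentry-based now (see 'lru' above); touching a dentry
+  // touches its ancestor dentries first, so the whole path is kept warm.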
+
+ void inode_remove_replica(CInode *in, int rep);
+
void rename_file(CDentry *srcdn, CDentry *destdn);
public:
void start_recovered_purges();
- protected:
- // private methods
+ public:
CDir *get_auth_container(CDir *in);
+ CDir *get_export_container(CDir *dir);
void find_nested_exports(CDir *dir, set<CDir*>& s);
void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
public:
+ CInode *create_root_inode();
int open_root(Context *c);
int path_traverse(filepath& path, vector<CDentry*>& trace, bool follow_trailing_sym,
Message *req, Context *ondelay,
#include "MDLog.h"
#include "MDS.h"
+#include "MDCache.h"
#include "LogEvent.h"
#include "osdc/Journaler.h"
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
-#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
+#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
+#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
// cons/des
num_events = 0;
waiting_for_read = false;
+ last_import_map = 0;
+ writing_import_map = false;
+ seen_import_map = false;
+
max_events = g_conf.mds_log_max_len;
+ capped = false;
+
unflushed = 0;
+ journaler = 0;
+ logger = 0;
+}
+
+
+MDLog::~MDLog()
+{
+ if (journaler) { delete journaler; journaler = 0; }
+ if (logger) { delete logger; logger = 0; }
+}
+
+
+void MDLog::init_journaler()
+{
// logger
char name[80];
sprintf(name, "mds%d.log", mds->get_nodeid());
static bool didit = false;
if (!didit) {
mdlog_logtype.add_inc("add");
- mdlog_logtype.add_inc("retire");
+ mdlog_logtype.add_inc("expire");
mdlog_logtype.add_inc("obs");
mdlog_logtype.add_inc("trim");
mdlog_logtype.add_set("size");
log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD;
log_inode.layout.osd = mds->get_nodeid() + 10000; // hack
}
-
+
// log streamer
+ if (journaler) delete journaler;
journaler = new Journaler(log_inode, mds->objecter, logger);
-
}
-MDLog::~MDLog()
-{
- if (journaler) { delete journaler; journaler = 0; }
- if (logger) { delete logger; logger = 0; }
-}
-
void MDLog::reset()
{
+ dout(5) << "reset to empty log" << endl;
+ init_journaler();
journaler->reset();
}
void MDLog::open(Context *c)
{
dout(5) << "open discovering log bounds" << endl;
+ init_journaler();
journaler->recover(c);
}
}
+off_t MDLog::get_read_pos()
+{
+ return journaler->get_read_pos();
+}
+
+off_t MDLog::get_write_pos()
+{
+ return journaler->get_write_pos();
+}
+
+
+
void MDLog::submit_entry( LogEvent *le,
Context *c )
{
- dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl;
-
if (g_conf.mds_log) {
+ dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl;
+
// encode it, with event type
bufferlist bl;
bl.append((char*)&le->_type, sizeof(le->_type));
// journal it.
journaler->append_entry(bl);
+ assert(!capped);
+
delete le;
num_events++;
else
unflushed++;
+ // should we log a new import_map?
+ // FIXME: should this go elsewhere?
+ if (last_import_map && !writing_import_map &&
+ journaler->get_write_pos() - last_import_map >= g_conf.mds_log_import_map_interval) {
+ // log import map
+ mds->mdcache->log_import_map();
+ }
+
} else {
// hack: log is disabled.
if (c) {
void MDLog::_trimmed(LogEvent *le)
{
- dout(7) << " trimmed " << *le << endl;
-
- assert(le->can_expire(mds));
+ dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl;
+ assert(le->has_expired(mds));
if (trimming.begin()->first == le->_end_off) {
- // front! we can expire the log a bit
+ // we trimmed off the front!
+ // we can expire the log a bit.
journaler->set_expire_pos(le->_end_off);
}
trim_waiters.push_back(c);
// trim!
+ dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl;
+
while (num_events > max_events) {
off_t gap = journaler->get_write_pos() - journaler->get_read_pos();
}
bufferlist bl;
+ off_t so = journaler->get_read_pos();
if (journaler->try_read_entry(bl)) {
// decode logevent
LogEvent *le = LogEvent::decode(bl);
+ le->_start_off = so;
le->_end_off = journaler->get_read_pos();
num_events--;
// we just read an event.
- if (le->can_expire(mds) == true) {
+ if (le->has_expired(mds)) {
// obsolete
- dout(7) << "trim obsolete: " << *le << endl;
+ dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << endl;
delete le;
logger->inc("obs");
} else {
assert ((int)trimming.size() < g_conf.mds_log_max_trimming);
// trim!
- dout(7) << "trim trimming: " << *le << endl;
+ dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << endl;
trimming[le->_end_off] = le;
- le->retire(mds, new C_MDL_Trimmed(this, le));
- logger->inc("retire");
+ le->expire(mds, new C_MDL_Trimmed(this, le));
+ logger->inc("expire");
logger->set("trim", trimming.size());
}
logger->set("read", journaler->get_read_pos());
std::list<Context*> finished;
finished.swap(trim_waiters);
finish_contexts(finished, 0);
+
+ // hmm, are we at the end?
+ /*
+ if (journaler->get_read_pos() == journaler->get_write_pos() &&
+ trimming.size() == import_map_expire_waiters.size()) {
+ dout(5) << "trim log is empty, allowing import_map to expire" << endl;
+ list<Context*> ls;
+ ls.swap(import_map_expire_waiters);
+ finish_contexts(ls);
+ }
+ */
}
LogEvent *le = LogEvent::decode(bl);
num_events++;
- if (le->has_happened(mds)) {
+ // have we seen an import map yet?
+ if (!seen_import_map &&
+ le->get_type() != EVENT_IMPORTMAP) {
dout(10) << "_replay " << pos << " / " << journaler->get_write_pos()
- << " : " << *le << " : already happened" << endl;
+ << " -- waiting for import_map. (skipping " << *le << ")" << endl;
} else {
dout(10) << "_replay " << pos << " / " << journaler->get_write_pos()
- << " : " << *le << " : applying" << endl;
+ << " : " << *le << endl;
le->replay(mds);
+
+ if (le->get_type() == EVENT_IMPORTMAP)
+ seen_import_map = true;
}
delete le;
}
int unflushed;
+ bool capped;
+
inode_t log_inode;
Journaler *journaler;
-
- //hash_map<LogEvent*> trimming; // events currently being trimmed
- map<off_t, LogEvent*> trimming;
+ map<off_t,LogEvent*> trimming;
std::list<Context*> trim_waiters; // contexts waiting for trim
bool trim_reading;
list<Context*> waitfor_replay;
+ // importmaps
+  off_t last_import_map;   // offset of last committed importmap.  constrains trimming.
+ list<Context*> import_map_expire_waiters;
+ bool writing_import_map; // one is being written now
+ bool seen_import_map; // for recovery
+
+ friend class EImportMap;
+ friend class C_MDS_WroteImportMap;
+ friend class MDCache;
+
+ void init_journaler();
+
+
+ public:
+ // replay state
+ map<inodeno_t, set<inodeno_t> > pending_exports;
+
+
+
public:
MDLog(MDS *m);
~MDLog();
-
+
+
+
void set_max_events(size_t max) { max_events = max; }
size_t get_max_events() { return max_events; }
size_t get_num_events() { return num_events + trimming.size(); }
+ size_t get_non_importmap_events() { return num_events + trimming.size() - import_map_expire_waiters.size(); }
+
+ off_t get_read_pos();
+ off_t get_write_pos();
+ bool empty() {
+ return get_read_pos() == get_write_pos();
+ }
+
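+  // once the log is capped, no further events may be submitted (submit_entry
+  // asserts !capped); capping also releases any waiters held back by pending
+  // import_map expiration.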
+ bool is_capped() { return capped; }
+ void cap() {
+ capped = true;
+ list<Context*> ls;
+ ls.swap(import_map_expire_waiters);
+ finish_contexts(ls);
+ }
void submit_entry( LogEvent *e, Context *c = 0 );
void wait_for_sync( Context *c );
#include "common/Timer.h"
#include "messages/MMDSMap.h"
-#include "messages/MMDSBoot.h"
+#include "messages/MMDSBeacon.h"
#include "messages/MPing.h"
#include "messages/MPingAck.h"
// cons/des
-MDS::MDS(int whoami, Messenger *m, MonMap *mm) {
+MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) {
this->whoami = whoami;
monmap = mm;
server = new Server(this);
locker = new Locker(this, mdcache);
+
+ // beacon
+ beacon_last_seq = 0;
+ beacon_sender = 0;
+ beacon_killer = 0;
- req_rate = 0;
+ // tick
+ tick_event = 0;
- state = STATE_BOOTING;
+ req_rate = 0;
- last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now();
+ want_state = state = MDSMap::STATE_DNE;
logger = logger2 = 0;
}
-void MDS::reopen_log()
+void MDS::reopen_logger()
{
// flush+close old log
if (logger) {
void MDS::send_message_mds(Message *m, int mds, int port, int fromport)
{
+ // send mdsmap first?
+ if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
+ messenger->send_message(new MMDSMap(mdsmap),
+ mdsmap->get_inst(mds));
+ peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
+ }
+
+ // send message
if (port && !fromport)
fromport = port;
- messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), port, fromport);
+ messenger->send_message(m, mdsmap->get_inst(mds), port, fromport);
}
-int MDS::init()
+class C_MDS_Tick : public Context {
+ MDS *mds;
+public:
+ C_MDS_Tick(MDS *m) : mds(m) {}
+ void finish(int r) {
+ mds->tick();
+ }
+};
+
+
+
+int MDS::init(bool standby)
{
- // request osd map
- dout(5) << "requesting mds and osd maps from mon" << endl;
- int mon = monmap->pick_mon();
- messenger->send_message(new MMDSBoot, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ mds_lock.Lock();
+
+ if (standby)
+ want_state = MDSMap::STATE_STANDBY;
+ else
+ want_state = MDSMap::STATE_STARTING;
+
+ // starting beacon. this will induce an MDSMap from the monitor
+ beacon_start();
+
+ // schedule tick
+ reset_tick();
+
+ mds_lock.Unlock();
return 0;
}
+void MDS::reset_tick()
+{
+ // cancel old
+ if (tick_event) timer.cancel_event(tick_event);
-void MDS::handle_mds_map(MMDSMap *m)
+ // schedule
+ tick_event = new C_MDS_Tick(this);
+ timer.add_event_after(g_conf.mon_tick_interval, tick_event);
+}
+
+void MDS::tick()
+{
+ // reschedule
+ reset_tick();
+
+ // log
+ mds_load_t load = balancer->get_load();
+
+ if (logger) {
+ req_rate = logger->get("req");
+
+ logger->set("l", (int)load.mds_load());
+ logger->set("q", messenger->get_dispatch_queue_len());
+ logger->set("buf", buffer_total_alloc);
+
+ mdcache->log_stat(logger);
+ }
+
+ // booted?
+ if (is_active()) {
+
+ // balancer
+ balancer->tick();
+
+ // HACK to test hashing stuff
+ if (false) {
+ /*
+ static map<int,int> didhash;
+ if (elapsed.sec() > 15 && !didhash[whoami]) {
+ CInode *in = mdcache->get_inode(100000010);
+ if (in && in->dir) {
+ if (in->dir->is_auth())
+ mdcache->migrator->hash_dir(in->dir);
+ didhash[whoami] = 1;
+ }
+ }
+ if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) {
+ CInode *in = mdcache->get_inode(100000010);
+ if (in && in->dir) {
+ if (in->dir->is_auth() && in->dir->is_hashed())
+ mdcache->migrator->unhash_dir(in->dir);
+ didhash[whoami] = 2;
+ }
+ }
+ */
+ }
+ }
+}
+
+
+
+
+// -----------------------
+// beacons
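+//
+// the mds sends MMDSBeacon(want_state, seq) to the monitor every
+// mds_beacon_interval and the monitor acks the seq back.  if no beacon is
+// acked within mds_beacon_grace of the last acked stamp, beacon_kill()
+// fires and this mds exits.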
+
+void MDS::beacon_start()
+{
+ beacon_send(); // send first beacon
+
+ //reset_beacon_killer(); // schedule killer
+}
+
+
+class C_MDS_BeaconSender : public Context {
+ MDS *mds;
+public:
+ C_MDS_BeaconSender(MDS *m) : mds(m) {}
+ void finish(int r) {
+ mds->beacon_send();
+ }
+};
+
+void MDS::beacon_send()
{
- map<epoch_t, bufferlist>::reverse_iterator p = m->maps.rbegin();
+ ++beacon_last_seq;
+ dout(10) << "beacon_send " << MDSMap::get_state_name(want_state)
+ << " seq " << beacon_last_seq
+ << " (currently " << MDSMap::get_state_name(state) << ")"
+ << endl;
+
+ beacon_seq_stamp[beacon_last_seq] = g_clock.now();
+
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MMDSBeacon(want_state, beacon_last_seq),
+ monmap->get_inst(mon));
- dout(1) << "handle_mds_map epoch " << p->first << endl;
- mdsmap->decode(p->second);
+ // schedule next sender
+ if (beacon_sender) timer.cancel_event(beacon_sender);
+ beacon_sender = new C_MDS_BeaconSender(this);
+ timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender);
+}
+
+void MDS::handle_mds_beacon(MMDSBeacon *m)
+{
+ dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state())
+ << " seq " << m->get_seq() << endl;
+ version_t seq = m->get_seq();
+
+  // update last acked beacon stamp
+ if (beacon_seq_stamp.count(seq)) {
+ assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp);
+ beacon_last_acked_stamp = beacon_seq_stamp[seq];
+
+ // clean up seq_stamp map
+ while (!beacon_seq_stamp.empty() &&
+ beacon_seq_stamp.begin()->first <= seq)
+ beacon_seq_stamp.erase(beacon_seq_stamp.begin());
+
+ reset_beacon_killer();
+ }
delete m;
+}
+
+class C_MDS_BeaconKiller : public Context {
+ MDS *mds;
+ utime_t lab;
+public:
+ C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {}
+ void finish(int r) {
+ mds->beacon_kill(lab);
+ }
+};
+
+void MDS::reset_beacon_killer()
+{
+ utime_t when = beacon_last_acked_stamp;
+ when += g_conf.mds_beacon_grace;
+
+ dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp
+ << ", will die at " << when << endl;
+ if (beacon_killer) timer.cancel_event(beacon_killer);
+
+ beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp);
+ timer.add_event_at(when, beacon_killer);
+}
+
+void MDS::beacon_kill(utime_t lab)
+{
+ if (lab == beacon_last_acked_stamp) {
+ dout(0) << "beacon_kill last_acked_stamp " << lab
+ << ", killing myself."
+ << endl;
+ exit(0);
+ } else {
+ dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp
+ << " != my " << lab
+ << ", doing nothing."
+ << endl;
+ }
+}
+
+
+
+void MDS::handle_mds_map(MMDSMap *m)
+{
+ version_t epoch = m->get_epoch();
+ dout(1) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl;
+
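+  // in order: remember the sender's epoch, drop stale maps, decode the new
+  // map, pick up my rank/state, share import maps with resolving peers,
+  // kick cache rejoin once everyone is rejoining, and tell the migrator
+  // about newly failed nodes.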
+ // note source's map version
+ if (m->get_source().is_mds() &&
+ peer_mdsmap_epoch[m->get_source().num()] < epoch) {
+ dout(15) << " peer " << m->get_source()
+ << " has mdsmap epoch >= " << epoch
+ << endl;
+ peer_mdsmap_epoch[m->get_source().num()] = epoch;
+ }
+
+ // is it new?
+ if (epoch <= mdsmap->get_epoch()) {
+ dout(1) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch()
+ << ", discarding" << endl;
+ delete m;
+ return;
+ }
+
+ // note some old state
+ int oldwhoami = whoami;
+ int oldstate = state;
+ set<int> oldresolve;
+ mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE);
+ bool wasrejoining = mdsmap->is_rejoining();
+ set<int> oldfailed;
+ mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED);
+
+ // decode and process
+ mdsmap->decode(m->get_encoded());
+
// see who i am
- int w = mdsmap->get_inst_rank(messenger->get_myinst());
- if (w != whoami) {
- whoami = w;
- messenger->reset_myaddr(MSG_ADDR_MDS(w));
- reopen_log();
+ whoami = mdsmap->get_inst_rank(messenger->get_myaddr());
+ if (oldwhoami != whoami) {
+ // update messenger.
+ messenger->reset_myname(MSG_ADDR_MDS(whoami));
+
+ // tell objecter my incarnation
+ objecter->set_client_incarnation(mdsmap->get_inc(whoami));
+
+ reopen_logger();
+ dout(1) << "handle_mds_map i am now mds" << whoami << endl;
+
+ // do i need an osdmap?
+ if (oldwhoami < 0) {
+ // we need an osdmap too.
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MOSDGetMap(0),
+ monmap->get_inst(mon));
+ }
}
- dout(1) << "map says i am " << w << endl;
- if (is_booting()) {
- // we need an osdmap too.
- int mon = monmap->pick_mon();
- messenger->send_message(new MOSDGetMap(0),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ // update my state
+ state = mdsmap->get_state(whoami);
+
+ // did it change?
+ if (oldstate != state) {
+ if (state == want_state) {
+ dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << endl;
+ } else {
+ dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state)
+ << ", although i wanted " << mdsmap->get_state_name(want_state)
+ << endl;
+ want_state = state;
+ }
+
+ // now active?
+ if (is_active()) {
+ dout(1) << "now active" << endl;
+ finish_contexts(waitfor_active); // kick waiters
+ }
+
+ else if (is_replay()) {
+ // initialize gather sets
+ set<int> rs;
+ mdsmap->get_recovery_mds_set(rs);
+ rs.erase(whoami);
+ dout(1) << "now replay. my recovery peers are " << rs << endl;
+ mdcache->set_recovery_set(rs);
+ }
+
+ // now stopping?
+ else if (is_stopping()) {
+ assert(oldstate == MDSMap::STATE_ACTIVE);
+ dout(1) << "now stopping" << endl;
+
+ mdcache->shutdown_start();
+
+ // save anchor table
+ if (mdsmap->get_anchortable() == whoami)
+ anchormgr->save(0); // FIXME? or detect completion via filer?
+
+ if (idalloc)
+ idalloc->save(0); // FIXME? or detect completion via filer?
+
+ // flush log
+ mdlog->set_max_events(0);
+ mdlog->trim(NULL);
+ }
+
+    // now stopped?
+ else if (is_stopped()) {
+ assert(oldstate == MDSMap::STATE_STOPPING);
+ dout(1) << "now stopped, sending down:out and exiting" << endl;
+ shutdown_final();
+ }
+ }
+
+
+ // is anyone resolving?
+ if (is_resolve() || is_rejoin() || is_active() || is_stopping()) {
+ set<int> resolve;
+ mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
+ if (oldresolve != resolve)
+ dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl;
+ for (set<int>::iterator p = resolve.begin(); p != resolve.end(); ++p) {
+ if (*p == whoami) continue;
+ if (oldresolve.count(*p) == 0 || // if other guy newly resolve, or
+ oldstate == MDSMap::STATE_REPLAY) // if i'm newly resolve,
+ mdcache->send_import_map(*p); // share my import map (now or later)
+ }
+ }
+
+ // is everybody finally rejoining?
+ if (is_rejoin() || is_active() || is_stopping()) {
+ if (!wasrejoining && mdsmap->is_rejoining()) {
+ mdcache->send_cache_rejoins();
+ }
}
+
+ // did anyone go down?
+ if (is_active() || is_stopping()) {
+ set<int> failed;
+ mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED);
+ for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p) {
+ // newly so?
+ if (oldfailed.count(*p)) continue;
+
+ mdcache->migrator->handle_mds_failure(*p);
+ }
+ }
+
+ delete m;
}
void MDS::handle_osd_map(MOSDMap *m)
{
+ version_t had = osdmap->get_epoch();
+
// process locally
objecter->handle_osd_map(m);
-
- if (is_booting()) {
- // we got our maps. mkfs for recovery?
- if (g_conf.mkfs)
- boot_mkfs();
+
+ if (had == 0) {
+ if (is_creating())
+ boot_create(); // new tables, journal
+ else if (is_starting())
+ boot_start(); // old tables, empty journal
+ else if (is_replay())
+ boot_replay(); // replay, join
else
- boot_recover();
- }
+ assert(is_standby());
+ }
// pass on to clients
for (set<int>::iterator it = clientmap.get_mount_set().begin();
MOSDMap *n = new MOSDMap;
n->maps = m->maps;
n->incremental_maps = m->incremental_maps;
- messenger->send_message(n, MSG_ADDR_CLIENT(*it), clientmap.get_inst(*it));
+ messenger->send_message(n, clientmap.get_inst(*it));
}
}
-class C_MDS_MkfsFinish : public Context {
+class C_MDS_BootFinish : public Context {
MDS *mds;
public:
- C_MDS_MkfsFinish(MDS *m) : mds(m) {}
- void finish(int r) { mds->boot_mkfs_finish(); }
+ C_MDS_BootFinish(MDS *m) : mds(m) {}
+ void finish(int r) { mds->boot_finish(); }
};
-void MDS::boot_mkfs()
+void MDS::boot_create()
{
- dout(3) << "boot_mkfs" << endl;
+ dout(3) << "boot_create" << endl;
+
+ C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this));
- C_Gather *fin = new C_Gather(new C_MDS_MkfsFinish(this));
-
if (whoami == 0) {
- dout(3) << "boot_mkfs - creating root inode and dir" << endl;
+ dout(3) << "boot_create since i am also mds0, creating root inode and dir" << endl;
// create root inode.
mdcache->open_root(0);
// force empty root dir
CDir *dir = root->dir;
dir->mark_complete();
- dir->mark_dirty();
-
+ dir->mark_dirty(dir->pre_dirty());
+
// save it
mdstore->commit_dir(dir, fin->new_sub());
}
-
+
// start with a fresh journal
- dout(10) << "boot_mkfs creating fresh journal" << endl;
+ dout(10) << "boot_create creating fresh journal" << endl;
mdlog->reset();
mdlog->write_head(fin->new_sub());
+
+ // write our first importmap
+ mdcache->log_import_map(fin->new_sub());
// fixme: fake out idalloc (reset, pretend loaded)
- dout(10) << "boot_mkfs creating fresh idalloc table" << endl;
+ dout(10) << "boot_create creating fresh idalloc table" << endl;
idalloc->reset();
idalloc->save(fin->new_sub());
// fixme: fake out anchortable
if (mdsmap->get_anchortable() == whoami) {
- dout(10) << "boot_mkfs creating fresh anchortable" << endl;
+ dout(10) << "boot_create creating fresh anchortable" << endl;
anchormgr->reset();
anchormgr->save(fin->new_sub());
}
}
-void MDS::boot_mkfs_finish()
+void MDS::boot_start()
+{
+ dout(2) << "boot_start" << endl;
+
+ C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this));
+
+ dout(2) << "boot_start opening idalloc" << endl;
+ idalloc->load(fin->new_sub());
+
+ if (mdsmap->get_anchortable() == whoami) {
+ dout(2) << "boot_start opening anchor table" << endl;
+ anchormgr->load(fin->new_sub());
+ } else {
+ dout(2) << "boot_start i have no anchor table" << endl;
+ }
+
+ dout(2) << "boot_start opening mds log" << endl;
+ mdlog->open(fin->new_sub());
+
+ if (mdsmap->get_root() == whoami) {
+ dout(2) << "boot_start opening root directory" << endl;
+ mdcache->open_root(fin->new_sub());
+ }
+}
+
+void MDS::boot_finish()
{
- dout(3) << "boot_mkfs_finish" << endl;
- mark_active();
+ dout(3) << "boot_finish" << endl;
+
+ if (is_starting()) {
+ // make sure mdslog is empty
+ assert(mdlog->get_read_pos() == mdlog->get_write_pos());
+ }
+
+ set_want_state(MDSMap::STATE_ACTIVE);
}
int nextstep;
public:
C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {}
- void finish(int r) { mds->boot_recover(nextstep); }
+ void finish(int r) { mds->boot_replay(nextstep); }
};
-void MDS::boot_recover(int step)
+void MDS::boot_replay(int step)
{
- if (is_booting())
- state = STATE_RECOVERING;
-
switch (step) {
case 0:
- if (whoami == 0) {
- dout(2) << "boot_recover " << step << ": creating root inode" << endl;
- mdcache->open_root(0);
- step = 1;
- // fall-thru
- } else {
- // FIXME
- assert(0);
- }
+ step = 1; // fall-thru.
case 1:
- dout(2) << "boot_recover " << step << ": opening idalloc" << endl;
+ dout(2) << "boot_replay " << step << ": opening idalloc" << endl;
idalloc->load(new C_MDS_BootRecover(this, 2));
break;
case 2:
if (mdsmap->get_anchortable() == whoami) {
- dout(2) << "boot_recover " << step << ": opening anchor table" << endl;
+ dout(2) << "boot_replay " << step << ": opening anchor table" << endl;
anchormgr->load(new C_MDS_BootRecover(this, 3));
break;
- } else {
- dout(2) << "boot_recover " << step << ": i have no anchor table" << endl;
- step++;
}
- // fall-thru
+ dout(2) << "boot_replay " << step << ": i have no anchor table" << endl;
+ step++; // fall-thru
case 3:
- dout(2) << "boot_recover " << step << ": opening mds log" << endl;
+ dout(2) << "boot_replay " << step << ": opening mds log" << endl;
mdlog->open(new C_MDS_BootRecover(this, 4));
break;
case 4:
- dout(2) << "boot_recover " << step << ": replaying mds log" << endl;
+ dout(2) << "boot_replay " << step << ": replaying mds log" << endl;
mdlog->replay(new C_MDS_BootRecover(this, 5));
break;
case 5:
- dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl;
+ dout(2) << "boot_replay " << step << ": restarting any recovered purges" << endl;
mdcache->start_recovered_purges();
- step++;
- // fall-thru
-
+
+ step++; // fall-thru
+
case 6:
- dout(2) << "boot_recover " << step << ": done." << endl;
- mark_active();
+ // done with replay!
+ if (mdsmap->get_num_mds(MDSMap::STATE_ACTIVE) == 0 &&
+ mdsmap->get_num_mds(MDSMap::STATE_STOPPING) == 0 &&
+ mdsmap->get_num_mds(MDSMap::STATE_RESOLVE) == 0 &&
+ mdsmap->get_num_mds(MDSMap::STATE_REJOIN) == 0 &&
+ mdsmap->get_num_mds(MDSMap::STATE_REPLAY) == 1 && // me
+ mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) {
+ dout(2) << "boot_replay " << step << ": i am alone, moving to state active" << endl;
+ set_want_state(MDSMap::STATE_ACTIVE);
+ } else {
+ dout(2) << "boot_replay " << step << ": i am not alone, moving to state resolve" << endl;
+ set_want_state(MDSMap::STATE_RESOLVE);
+ }
+ break;
+
}
}
-
-void MDS::mark_active()
+void MDS::set_want_state(int s)
{
- dout(3) << "mark_active" << endl;
- state = STATE_ACTIVE;
- finish_contexts(waitfor_active); // kick waiters
+ dout(3) << "set_want_state " << MDSMap::get_state_name(s) << endl;
+ want_state = s;
+ beacon_send();
}
-
int MDS::shutdown_start()
{
dout(1) << "shutdown_start" << endl;
derr(0) << "mds shutdown start" << endl;
- for (set<int>::iterator p = mdsmap->get_mds().begin();
- p != mdsmap->get_mds().end();
+ // tell everyone to stop.
+ set<int> active;
+ mdsmap->get_active_mds_set(active);
+ for (set<int>::iterator p = active.begin();
+ p != active.end();
p++) {
- dout(1) << "sending MShutdownStart to mds" << *p << endl;
- send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART),
- *p, MDS_PORT_MAIN);
+ if (mdsmap->is_up(*p)) {
+ dout(1) << "sending MShutdownStart to mds" << *p << endl;
+ send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART),
+ *p, MDS_PORT_MAIN);
+ }
}
- if (idalloc) idalloc->shutdown();
-
- handle_shutdown_start(NULL);
+ // go
+ set_want_state(MDSMap::STATE_STOPPING);
return 0;
}
dout(1) << " handle_shutdown_start" << endl;
// set flag
- state = STATE_STOPPING;
-
- mdcache->shutdown_start();
-
- // save anchor table
- if (mdsmap->get_anchortable() == whoami)
- anchormgr->save(0); // FIXME FIXME
+ set_want_state(MDSMap::STATE_STOPPING);
- // flush log
- mdlog->set_max_events(0);
- mdlog->trim(NULL);
-
- if (m) delete m;
-
- //g_conf.debug_mds = 10;
+ delete m;
}
int MDS::shutdown_final()
{
- dout(1) << "shutdown" << endl;
-
- state = STATE_STOPPED;
+ dout(1) << "shutdown_final" << endl;
+
+ // send final down:out beacon (it doesn't matter if this arrives)
+ set_want_state(MDSMap::STATE_OUT);
+
+ // stop timers
+ if (beacon_killer) {
+ timer.cancel_event(beacon_killer);
+ beacon_killer = 0;
+ }
+ if (beacon_sender) {
+ timer.cancel_event(beacon_sender);
+ beacon_sender = 0;
+ }
+ if (tick_event) {
+ timer.cancel_event(tick_event);
+ tick_event = 0;
+ }
+ timer.cancel_all();
+ timer.join();
// shut down cache
mdcache->shutdown();
-
- // tell monitor
- messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
- MSG_ADDR_MON(0), monmap->get_inst(0));
-
+
// shut down messenger
messenger->shutdown();
-
+
return 0;
}
+
void MDS::dispatch(Message *m)
{
- // make sure we advacne the clock
- g_clock.now();
-
- // process
mds_lock.Lock();
my_dispatch(m);
mds_lock.Unlock();
void MDS::my_dispatch(Message *m)
{
+ // from bad mds?
+ if (m->get_source().is_mds()) {
+ int from = m->get_source().num();
+ if (!mdsmap->have_inst(from) ||
+ mdsmap->get_inst(from) != m->get_source_inst()) {
+ // bogus mds?
+ if (m->get_type() != MSG_MDS_MAP) {
+ dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+ << ", dropping" << endl;
+ delete m;
+ return;
+ } else {
+ dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+ << ", but it's an mdsmap, looking at it" << endl;
+ }
+ }
+ }
+
switch (m->get_dest_port()) {
// HACK FOR NOW
- /*
- static bool did_heartbeat_hack = false;
- if (!shutting_down && !shut_down &&
- false &&
- !did_heartbeat_hack) {
- osdmonitor->initiate_heartbeat();
- did_heartbeat_hack = true;
- }
- */
-
-
if (is_active()) {
// flush log to disk after every op. for now.
mdlog->flush();
- // hash root?
+ // hack: force hash root?
if (false &&
mdcache->get_root() &&
mdcache->get_root()->dir &&
}
- // periodic crap (1-second resolution)
- static utime_t last_log = g_clock.recent_now();
- utime_t now = g_clock.recent_now();
- if (is_active() &&
- last_log.sec() != now.sec()) {
-
- // log
- last_log = now;
- mds_load_t load = balancer->get_load();
-
- if (logger) {
- req_rate = logger->get("req");
-
- logger->set("l", (int)load.mds_load());
- logger->set("q", messenger->get_dispatch_queue_len());
- logger->set("buf", buffer_total_alloc);
-
- mdcache->log_stat(logger);
- }
- // balance?
- static int num_bal_times = g_conf.mds_bal_max;
- static utime_t first = g_clock.recent_now();
- utime_t elapsed = now;
- elapsed -= first;
- if (true &&
- whoami == 0 &&
- (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) &&
- !is_stopping() && !is_stopped() &&
- now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) {
- last_balancer_heartbeat = now;
- balancer->send_heartbeat();
- num_bal_times--;
- }
-
- // hash?
- if (true &&
- g_conf.num_mds > 1 &&
- now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) {
- last_balancer_hash = now;
- balancer->do_hashing();
- }
-
-
-
- // HACK to test hashing stuff
- if (false) {
- static map<int,int> didhash;
- if (elapsed.sec() > 15 && !didhash[whoami]) {
- CInode *in = mdcache->get_inode(100000010);
- if (in && in->dir) {
- if (in->dir->is_auth())
- mdcache->migrator->hash_dir(in->dir);
- didhash[whoami] = 1;
- }
- }
- if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) {
- CInode *in = mdcache->get_inode(100000010);
- if (in && in->dir) {
- if (in->dir->is_auth() && in->dir->is_hashed())
- mdcache->migrator->unhash_dir(in->dir);
- didhash[whoami] = 2;
- }
- }
- }
-
-
-
- }
-
// HACK to force export to test foreign renames
if (false && whoami == 0) {
static bool didit = false;
// 7 to 1
CInode *in = mdcache->get_inode(1001);
if (in && in->is_dir() && !didit) {
- CDir *dir = in->get_or_open_dir(this);
+ CDir *dir = in->get_or_open_dir(mdcache);
if (dir->is_auth()) {
dout(1) << "FORCING EXPORT" << endl;
mdcache->migrator->export_dir(dir,1);
// shut down?
if (is_stopping()) {
if (mdcache->shutdown_pass()) {
- dout(7) << "shutdown_pass=true, finished w/ shutdown" << endl;
- shutdown_final();
+ dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to up:stopped" << endl;
+
+ // tell monitor we shut down cleanly.
+ set_want_state(MDSMap::STATE_STOPPED);
}
}
handle_mds_map((MMDSMap*)m);
return;
+ case MSG_MDS_BEACON:
+ handle_mds_beacon((MMDSBeacon*)m);
+ return;
+
case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+
handle_shutdown_start(m);
return;
-
-
case MSG_PING:
handle_ping((MPing*)m);
return;
+
+ default:
+ assert(0);
}
}
dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl;
messenger->send_message(new MPingAck(m),
- m->get_source(), m->get_source_inst());
+ m->get_source_inst());
delete m;
}
#include <ext/hash_map>
using namespace __gnu_cxx;
+#include "mdstypes.h"
+
#include "msg/Dispatcher.h"
#include "include/types.h"
#include "include/Context.h"
#include "common/DecayCounter.h"
#include "common/Logger.h"
#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Timer.h"
#include "mon/MonMap.h"
+#include "MDSMap.h"
#include "ClientMap.h"
class filepath;
-class MDSMap;
class OSDMap;
class Objecter;
class Filer;
class MHashReaddir;
class MHashReaddirReply;
-
+class MMDSBeacon;
class MDS : public Dispatcher {
public:
Mutex mds_lock;
+ SafeTimer timer;
+
protected:
int whoami;
Logger *logger, *logger2;
-
protected:
// -- MDS state --
- static const int STATE_BOOTING = 1; // fetching mds and osd maps
- static const int STATE_MKFS = 2; // creating a file system
- static const int STATE_RECOVERING = 3; // recovering mds log
- static const int STATE_ACTIVE = 4; // up and active!
- static const int STATE_STOPPING = 5;
- static const int STATE_STOPPED = 6;
-
- int state;
+ int state; // my confirmed state
+ int want_state; // the state i want
list<Context*> waitfor_active;
-public:
+ map<int,version_t> peer_mdsmap_epoch;
+
+ public:
void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); }
- bool is_booting() { return state == STATE_BOOTING; }
- bool is_recovering() { return state == STATE_RECOVERING; }
- bool is_active() { return state == STATE_ACTIVE; }
- bool is_stopping() { return state == STATE_STOPPING; }
- bool is_stopped() { return state == STATE_STOPPED; }
+ bool is_dne() { return state == MDSMap::STATE_DNE; }
+ bool is_out() { return state == MDSMap::STATE_OUT; }
+ bool is_failed() { return state == MDSMap::STATE_FAILED; }
+ bool is_creating() { return state == MDSMap::STATE_CREATING; }
+ bool is_starting() { return state == MDSMap::STATE_STARTING; }
+ bool is_standby() { return state == MDSMap::STATE_STANDBY; }
+ bool is_replay() { return state == MDSMap::STATE_REPLAY; }
+ bool is_resolve() { return state == MDSMap::STATE_RESOLVE; }
+ bool is_rejoin() { return state == MDSMap::STATE_REJOIN; }
+ bool is_active() { return state == MDSMap::STATE_ACTIVE; }
+ bool is_stopping() { return state == MDSMap::STATE_STOPPING; }
+ bool is_stopped() { return state == MDSMap::STATE_STOPPED; }
- void mark_active();
+ void set_want_state(int s);
// -- waiters --
finished_queue.splice( finished_queue.end(), ls );
}
+ // -- keepalive beacon --
+ version_t beacon_last_seq; // last seq sent to monitor
+ map<version_t,utime_t> beacon_seq_stamp; // seq # -> time sent
+ utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked
+ Context *beacon_sender;
+ Context *beacon_killer; // next scheduled time of death
+ // tick and other timer fun
+ Context *tick_event;
+ void reset_tick();
+
+
// shutdown crap
int req_rate;
friend class MDStore;
-
- public:
-
- protected:
- utime_t last_balancer_heartbeat, last_balancer_hash;
-
+
public:
MDS(int whoami, Messenger *m, MonMap *mm);
~MDS();
void send_message_mds(Message *m, int mds, int port=0, int fromport=0);
// start up, shutdown
- int init();
- void reopen_log();
+ int init(bool standby=false);
+ void reopen_logger();
- void boot_mkfs();
- void boot_mkfs_finish();
- void boot_recover(int step=0);
+ void boot_create(); // i am new mds.
+ void boot_start(); // i am old but empty (was down:out) mds.
+ void boot_replay(int step=0); // i am recovering existing (down:failed) mds.
+ void boot_finish();
int shutdown_start();
int shutdown_final();
- int hash_dentry(inodeno_t ino, const string& s) {
- return 0; // fixme
- }
+ void tick();
+ void beacon_start();
+ void beacon_send();
+ void beacon_kill(utime_t lab);
+ void handle_mds_beacon(MMDSBeacon *m);
+ void reset_beacon_killer();
// messages
void proc_message(Message *m);
using namespace std;
class MDSMap {
+ public:
+ // mds states
+ static const int STATE_DNE = 0; // down, never existed.
+ static const int STATE_OUT = 1; // down, once existed, but no imports, empty log.
+ static const int STATE_FAILED = 2; // down, holds (er, held) metadata; needs to be recovered.
+
+ static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor.
+ static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..)
+ static const int STATE_STARTING = 5; // up, starting prior out MDS instance.
+ static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state
+  static const int STATE_REPLAY    = 6;    // up, scanning journal, recovering any shared state
+ static const int STATE_REJOIN = 8; // up, replayed journal, rejoining distributed cache
+ static const int STATE_ACTIVE = 9; // up, active
+ static const int STATE_STOPPING = 10; // up, exporting metadata (-> standby or out)
+ static const int STATE_STOPPED = 11; // up, finished stopping. like standby, but not avail to takeover.
+
+ static const char *get_state_name(int s) {
+ switch (s) {
+ // down
+ case STATE_DNE: return "down:dne";
+ case STATE_OUT: return "down:out";
+ case STATE_FAILED: return "down:failed";
+ // up
+ case STATE_STANDBY: return "up:standby";
+ case STATE_CREATING: return "up:creating";
+ case STATE_STARTING: return "up:starting";
+ case STATE_REPLAY: return "up:replay";
+ case STATE_RESOLVE: return "up:resolve";
+ case STATE_REJOIN: return "up:rejoin";
+ case STATE_ACTIVE: return "up:active";
+ case STATE_STOPPING: return "up:stopping";
+ case STATE_STOPPED: return "up:stopped";
+ default: assert(0);
+ }
+ return 0;
+ }
+
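+  // rough lifecycle (sketch): creating or starting (or, for a failed mds,
+  // replay -> resolve -> rejoin) leads to active; shutdown goes active ->
+  // stopping -> stopped.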
protected:
epoch_t epoch;
utime_t ctime;
- int anchortable;
+ int anchortable; // which MDS has anchortable (fixme someday)
+ int root; // which MDS has root directory
- set<int> all_mds;
- set<int> down_mds;
- map<int,entity_inst_t> mds_inst;
+ set<int> mds_created; // which mds ids have initialized journals and id tables.
+ map<int,int> mds_state; // MDS state
+ map<int,version_t> mds_state_seq;
+ map<int,entity_inst_t> mds_inst; // up instances
+ map<int,int> mds_inc; // incarnation count (monotonically increases)
friend class MDSMonitor;
public:
- MDSMap() : epoch(0), anchortable(0) {}
+ MDSMap() : epoch(0), anchortable(0), root(0) {}
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
const utime_t& get_ctime() const { return ctime; }
int get_anchortable() const { return anchortable; }
+ int get_root() const { return root; }
+
+ // counts
+ int get_num_mds() const { return mds_state.size(); }
+ int get_num_mds(int state) {
+ int n = 0;
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (p->second == state) ++n;
+ return n;
+ }
+ int get_num_up_mds() {
+ int n = 0;
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (is_up(p->first)) ++n;
+ return n;
+ }
+ int get_num_up_or_failed_mds() {
+ int n = 0;
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (is_up(p->first) || is_failed(p->first))
+ ++n;
+ return n;
+ }
+
+ // sets
+ void get_mds_set(set<int>& s) {
+ s.clear();
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ s.insert(p->first);
+ }
+ void get_up_mds_set(set<int>& s) {
+ s.clear();
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (is_up(p->first))
+ s.insert(p->first);
+ }
+ void get_mds_set(set<int>& s, int state) {
+ s.clear();
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (p->second == state)
+ s.insert(p->first);
+ }
+ void get_active_mds_set(set<int>& s) {
+ get_mds_set(s, MDSMap::STATE_ACTIVE);
+ }
+ void get_failed_mds_set(set<int>& s) {
+ get_mds_set(s, MDSMap::STATE_FAILED);
+ }
+ void get_recovery_mds_set(set<int>& s) {
+ s.clear();
+ for (map<int,int>::const_iterator p = mds_state.begin();
+ p != mds_state.end();
+ p++)
+ if (is_failed(p->first) ||
+ is_replay(p->first) || is_resolve(p->first) || is_rejoin(p->first) ||
+ is_active(p->first) || is_stopping(p->first))
+ s.insert(p->first);
+ }
+
+
+ // mds states
+ bool is_down(int m) { return is_dne(m) || is_out(m) || is_failed(m); }
+ bool is_up(int m) { return !is_down(m); }
+
+ bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; }
+ bool is_out(int m) { return mds_state.count(m) && mds_state[m] == STATE_OUT; }
+ bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; }
- int get_num_mds() const { return all_mds.size(); }
- int get_num_up_mds() const { return all_mds.size() - down_mds.size(); }
+ bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; }
+ bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; }
+ bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; }
+ bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; }
+ bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; }
+ bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; }
+ bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; }
+ bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; }
+ bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; }
+
+ bool has_created(int m) { return mds_created.count(m); }
+
+ // cluster states
+  bool is_degraded() {   // degraded = some recovery in progress.  fixes active membership and recovery_set.
+ return get_num_mds(STATE_REPLAY) +
+ get_num_mds(STATE_RESOLVE) +
+ get_num_mds(STATE_REJOIN) +
+ get_num_mds(STATE_FAILED);
+ }
+ /*bool is_resolving() { // nodes are resolving distributed ops
+ return get_num_mds(STATE_RESOLVE);
+ }*/
+ bool is_rejoining() {
+ // nodes are rejoining cache state
+ return get_num_mds(STATE_REJOIN) > 0 &&
+ get_num_mds(STATE_RESOLVE) == 0 &&
+ get_num_mds(STATE_REPLAY) == 0 &&
+ get_num_mds(STATE_FAILED) == 0;
+ }
- const set<int>& get_mds() const { return all_mds; }
- const set<int>& get_down_mds() const { return down_mds; }
- bool is_down(int m) const { return down_mds.count(m); }
- bool is_up(int m) const { return !is_down(m); }
+ int get_state(int m) {
+ if (mds_state.count(m)) return mds_state[m];
+ return STATE_OUT;
+ }
+ // inst
+ bool have_inst(int m) {
+ return mds_inst.count(m);
+ }
const entity_inst_t& get_inst(int m) {
assert(mds_inst.count(m));
return mds_inst[m];
return false;
}
- int get_inst_rank(const entity_inst_t& inst) {
+ int get_inst_rank(const entity_addr_t& addr) {
for (map<int,entity_inst_t>::iterator p = mds_inst.begin();
p != mds_inst.end();
++p) {
- if (p->second == inst) return p->first;
+ if (p->second.addr == addr) return p->first;
}
+ /*else
+ for (map<int,entity_inst_t>::iterator p = mds_inst.begin();
+ p != mds_inst.end();
+ ++p) {
+ if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first;
+ }
+ */
+
return -1;
}
+ int get_inc(int m) {
+ assert(mds_inc.count(m));
+ return mds_inc[m];
+ }
+
+
+ void remove_mds(int m) {
+ mds_inst.erase(m);
+ mds_state.erase(m);
+ mds_state_seq.erase(m);
+ }
+
// serialize, unserialize
void encode(bufferlist& blist) {
blist.append((char*)&epoch, sizeof(epoch));
blist.append((char*)&ctime, sizeof(ctime));
blist.append((char*)&anchortable, sizeof(anchortable));
+ blist.append((char*)&root, sizeof(root));
- _encode(all_mds, blist);
- _encode(down_mds, blist);
- _encode(mds_inst, blist);
+ ::_encode(mds_state, blist);
+ ::_encode(mds_state_seq, blist);
+ ::_encode(mds_inst, blist);
+ ::_encode(mds_inc, blist);
}
void decode(bufferlist& blist) {
off += sizeof(ctime);
blist.copy(off, sizeof(anchortable), (char*)&anchortable);
off += sizeof(anchortable);
+ blist.copy(off, sizeof(root), (char*)&root);
+ off += sizeof(root);
- _decode(all_mds, blist, off);
- _decode(down_mds, blist, off);
- _decode(mds_inst, blist, off);
+ ::_decode(mds_state, blist, off);
+ ::_decode(mds_state_seq, blist, off);
+ ::_decode(mds_inst, blist, off);
+ ::_decode(mds_inc, blist, off);
}
}
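
The MDSMap changes above replace the old up/down sets with a single rank-to-state map, so cluster-wide predicates like is_degraded() reduce to counting ranks that sit in recovery states. A minimal standalone sketch of that idea, with toy names (State, count_state, degraded) standing in for the real MDSMap interface:

    // sketch only: toy model of the rank->state bookkeeping in the patch above
    #include <cassert>
    #include <map>

    enum State { STATE_ACTIVE, STATE_REPLAY, STATE_RESOLVE, STATE_REJOIN, STATE_FAILED };

    static int count_state(const std::map<int,int>& rank_state, int state) {
      int n = 0;
      for (std::map<int,int>::const_iterator p = rank_state.begin();
           p != rank_state.end(); ++p)
        if (p->second == state) ++n;
      return n;
    }

    // "degraded" here means some rank is still recovering (or failed)
    static bool degraded(const std::map<int,int>& rank_state) {
      return count_state(rank_state, STATE_REPLAY) ||
             count_state(rank_state, STATE_RESOLVE) ||
             count_state(rank_state, STATE_REJOIN) ||
             count_state(rank_state, STATE_FAILED);
    }

    int main() {
      std::map<int,int> rank_state;
      rank_state[0] = STATE_ACTIVE;
      rank_state[1] = STATE_REPLAY;   // rank 1 is replaying its journal
      assert(degraded(rank_state));
      rank_state[1] = STATE_ACTIVE;
      assert(!degraded(rank_state));
      return 0;
    }
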
// make sure we have a CDir
- CDir *dir = idir->get_or_open_dir(mds);
+ CDir *dir = idir->get_or_open_dir(mds->mdcache);
// do it
dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl;
// what to do?
if (hashcode >= 0) {
- int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname );
assert(dentryhashcode == hashcode);
}
// what to do?
if (hashcode >= 0) {
- int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname );
assert(dentryhashcode == hashcode);
}
dout(12) << "readdir had dentry " << dname << endl;
// under water?
- if (dn->get_inode()->get_parent_dir_version() <= got_version) {
+ if (dn->get_version() <= got_version) {
+ assert(dn->get_inode()->get_version() <= got_version);
dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl;
- dn->get_inode()->mark_clean();
dn->mark_clean();
+ dn->get_inode()->mark_clean();
}
}
continue;
CInode *in = mds->mdcache->get_inode(ino);
assert(in && in->dir);
if (in && in->dir && in->dir->is_auth()) {
- dout(7) << "CommitDirVerify: current version = " << in->dir->get_version() << endl;
- dout(7) << "CommitDirVerify: last committed = " << in->dir->get_last_committed_version() << endl;
- dout(7) << "CommitDirVerify: required = " << version << endl;
+ dout(7) << "CommitDirVerify: current = " << in->dir->get_version()
+ << ", last committed = " << in->dir->get_last_committed_version()
+ << ", required = " << version << endl;
if (in->dir->get_last_committed_version() >= version) {
dout(7) << "my required version is safe, done." << endl;
+ if (c) {
+ c->finish(0);
+ delete c;
+ }
} else {
dout(7) << "my required version is still not safe, committing again." << endl;
mds->mdstore->commit_dir(in->dir,
version,
c);
- return;
}
+ return;
}
- }
-
+ }
+
  // must have exported or something!
dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl;
+
+ // finish.
if (c) {
c->finish(-1);
delete c;
CDentry *dn = it->second;
if (hashcode >= 0) {
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != hashcode) continue;
}
- // put dentry in this version
- if (dn->is_dirty()) {
- dn->float_parent_dir_version( dir->get_version() );
- dout(12) << " dirty dn " << *dn << " now " << dn->get_parent_dir_version() << endl;
- }
-
if (dn->is_null()) continue; // skipping negative entry
// primary or remote?
dout(18) << " inlcuding symlink ptr " << in->symlink << endl;
dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1);
}
-
- // put inode in this dir version
- if (in->is_dirty()) {
- in->float_parent_dir_version( dir->get_version() );
- dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl;
-
- in->set_committing_version( in->get_version() );
- assert(in->get_last_committed_version() < in->get_committing_version());
- } else {
- assert(in->get_committing_version() == in->get_version());
- }
-
}
num++;
it++;
if (hashcode >= 0) {
- int dentryhashcode = mds->hash_dentry( dir->ino(), dn->get_name() );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dn->get_name() );
if (dentryhashcode != hashcode) continue;
}
// dentry
- if (committed_version > dn->get_parent_dir_version()) {
- dout(15) << " dir " << committed_version << " > dn " << dn->get_parent_dir_version() << " still clean " << *dn << endl;
- assert(!dn->is_dirty());
- }
- else if (dn->get_parent_dir_version() == committed_version) {
- dout(15) << " dir " << committed_version << " == dn " << dn->get_parent_dir_version() << " now clean " << *dn << endl;
- if (dn->is_dirty())
- dn->mark_clean(); // might not but could be dirty
-
- // remove, if it's null and unlocked
- if (dn->is_null() && dn->is_sync()) {
- dout(15) << " removing clean and null " << *dn << endl;
- null_clean.push_back(dn);
- continue;
- }
+ if (committed_version >= dn->get_version()) {
+ if (dn->is_dirty()) {
+ dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl;
+ dn->mark_clean();
+ }
} else {
- dout(15) << " dir " << committed_version << " < dn " << dn->get_parent_dir_version() << " still dirty " << *dn << endl;
- assert(committed_version < dn->get_parent_dir_version());
- //assert(dn->is_dirty() || !dn->is_sync()); // -OR- we did a fetch_dir in order to do a newer commit...
+ dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl;
}
// only do primary...
- if (!dn->is_primary()) continue;
+ if (!dn->is_primary())
+ continue;
CInode *in = dn->get_inode();
assert(in);
assert(in->is_auth());
- if (in->get_committing_version())
- in->set_committed_version();
-
- if (committed_version > in->get_parent_dir_version()) {
- dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl;
- assert(!in->is_dirty());
- }
- else if (in->get_parent_dir_version() == committed_version) {
- dout(15) << " dir " << committed_version << " == inode " << in->get_parent_dir_version() << " now clean " << *(in) << endl;
- in->mark_clean(); // might not but could be dirty
+ if (committed_version >= in->get_version()) {
+ if (in->is_dirty()) {
+ dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl;
+ in->mark_clean();
+ }
} else {
- dout(15) << " dir " << committed_version << " < inode " << in->get_parent_dir_version() << " still dirty " << *(in) << endl;
- assert(committed_version < in->get_parent_dir_version());
- //assert(in->is_dirty()); // -OR- we did a fetch_dir in order to do a newer commit...
+ dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl;
+ assert(in->is_dirty());
}
}
- // remove null clean dentries
- for (list<CDentry*>::iterator it = null_clean.begin();
- it != null_clean.end();
- it++)
- dir->remove_dentry(*it);
-
// unpin
dir->auth_unpin();
#include "CDentry.h"
#include "Migrator.h"
#include "Locker.h"
+#include "MDStore.h"
+#include "Migrator.h"
#include "MDBalancer.h"
#include "MDLog.h"
#include "include/filepath.h"
-#include "events/EInodeUpdate.h"
-#include "events/EDirUpdate.h"
+#include "events/EString.h"
+#include "events/EExportStart.h"
+#include "events/EExportFinish.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
#include "msg/Messenger.h"
#include "messages/MUnhashDirNotifyAck.h"
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator "
+
+
void Migrator::dispatch(Message *m)
{
}
+
+
+// ==========================================================
+// mds failure handling
+
+void Migrator::handle_mds_failure(int who)
+{
+ dout(5) << "handle_mds_failure mds" << who << endl;
+
+ // check my exports
+ map<CDir*,int>::iterator p = export_state.begin();
+ while (p != export_state.end()) {
+ map<CDir*,int>::iterator next = p;
+ next++;
+ CDir *dir = p->first;
+
+ if (export_peer[dir] == who) {
+ // the guy i'm exporting to failed.
+ // clean up.
+ dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl;
+
+ switch (p->second) {
+ case EXPORT_DISCOVERING:
+ dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl;
+ dir->unfreeze_tree(); // cancel the freeze
+ dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze)
+ break;
+
+ case EXPORT_FREEZING:
+ dout(10) << "state freezing : canceling freeze" << endl;
+ dir->unfreeze_tree(); // cancel the freeze
+ break;
+
+ case EXPORT_LOGGINGSTART:
+ case EXPORT_PREPPING:
+ dout(10) << "state loggingstart|prepping : logging EExportFinish(false)" << endl;
+ mds->mdlog->submit_entry(new EExportFinish(dir,false));
+ // logger will unfreeze.
+ break;
+
+ case EXPORT_EXPORTING:
+ dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl;
+ mds->mdlog->submit_entry(new EExportFinish(dir,false));
+ reverse_export(dir);
+ dir->unfreeze_tree();
+ break;
+
+ case EXPORT_LOGGINGFINISH:
+ dout(10) << "state loggingfinish : doing nothing, we were successful." << endl;
+ break;
+
+ default:
+ assert(0);
+ }
+
+ export_state.erase(dir);
+ export_peer.erase(dir);
+
+ // unpin the path
+ vector<CDentry*> trace;
+ cache->make_trace(trace, dir->inode);
+ cache->path_unpin(trace, 0);
+
+ // wake up any waiters
+ mds->queue_finished(export_finish_waiters[dir]);
+ export_finish_waiters.erase(dir);
+
+ // send pending import_maps?
+ mds->mdcache->send_pending_import_maps();
+
+ mds->mdcache->show_imports();
+ mds->mdcache->show_cache();
+ } else {
+ // third party failed. potential peripheral damage?
+ if (p->second == EXPORT_EXPORTING) {
+ // yeah, i'm waiting for acks, let's fake theirs.
+ if (export_notify_ack_waiting[dir].count(who)) {
+ dout(10) << "faking export_dir_notify_ack from mds" << who
+ << " on " << *dir << " to mds" << export_peer[dir]
+ << endl;
+ export_notify_ack_waiting[dir].erase(who);
+ if (export_notify_ack_waiting[dir].empty())
+ export_dir_acked(dir);
+ }
+ }
+ }
+
+ // next!
+ p = next;
+ }
+
+
+ // check my imports
+ map<inodeno_t,int>::iterator q = import_state.begin();
+ while (q != import_state.end()) {
+ map<inodeno_t,int>::iterator next = q;
+ next++;
+ inodeno_t dirino = q->first;
+ CInode *diri = mds->mdcache->get_inode(dirino);
+ CDir *dir = 0;
+ if (diri)
+ dir = diri->dir;
+
+ if (import_peer[dirino] == who) {
+      switch (import_state[dirino]) {
+ case IMPORT_DISCOVERED:
+
+ break;
+
+ case IMPORT_PREPPING:
+
+ break;
+
+ case IMPORT_PREPPED:
+
+ break;
+
+ case IMPORT_LOGGINGSTART:
+
+ break;
+
+ case IMPORT_ACKING:
+ // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
+ // ...
+ break;
+
+ case IMPORT_LOGGINGFINISH:
+ // do nothing, exporter is no longer involved.
+ break;
+ }
+ }
+
+ // next!
+ q = next;
+ }
+}
+
+
+
+
+
+
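
handle_mds_failure above is essentially a rollback dispatch: each in-flight export remembers the last stage it completed, and on peer failure the switch undoes exactly that much. A toy sketch of the same shape, with hypothetical names (Stage, cleanup) and plain strings in place of CDir*:

    // sketch only: per-export cleanup dispatch, reduced to a toy
    #include <iostream>
    #include <map>
    #include <string>

    enum Stage { DISCOVERING, FREEZING, LOGGING_START, PREPPING, EXPORTING, LOGGING_FINISH };

    static void cleanup(const std::string& dir, Stage s) {
      switch (s) {
      case DISCOVERING:
        std::cout << dir << ": cancel freeze, drop auth pin\n";
        break;
      case FREEZING:
        std::cout << dir << ": cancel freeze\n";
        break;
      case LOGGING_START:
      case PREPPING:
        std::cout << dir << ": log abort, let the logger unfreeze\n";
        break;
      case EXPORTING:
        std::cout << dir << ": log abort, reverse the export, unfreeze\n";
        break;
      case LOGGING_FINISH:
        std::cout << dir << ": nothing to undo, export already succeeded\n";
        break;
      }
    }

    int main() {
      std::map<std::string, Stage> export_stage;
      export_stage["/usr"]  = EXPORTING;
      export_stage["/home"] = FREEZING;
      // peer died: roll back each in-flight export according to its stage
      for (std::map<std::string, Stage>::iterator p = export_stage.begin();
           p != export_stage.end(); ++p)
        cleanup(p->first, p->second);
      return 0;
    }
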
// ==========================================================
-// IMPORT/EXPORT
+// EXPORT
class C_MDC_ExportFreeze : public Context {
C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) :
mig(m), ex(e), dest(d) {}
virtual void finish(int r) {
- mig->export_dir_frozen(ex, dest);
+ if (r >= 0)
+ mig->export_dir_frozen(ex, dest);
}
};
* will fail if the directory is freezing, frozen, unpinnable, or root.
*/
void Migrator::export_dir(CDir *dir,
- int dest)
+ int dest)
{
dout(7) << "export_dir " << *dir << " to " << dest << endl;
assert(dest != mds->get_nodeid());
assert(!dir->is_hashed());
+ if (mds->mdsmap->is_degraded()) {
+ dout(7) << "cluster degraded, no exports for now" << endl;
+ return;
+ }
+
if (dir->inode->is_root()) {
dout(7) << "i won't export root" << endl;
assert(0);
}
// ok, let's go.
+ assert(export_state.count(dir) == 0);
+ export_state[dir] = EXPORT_DISCOVERING;
+ export_peer[dir] = dest;
// send ExportDirDiscover (ask target)
- export_gather[dir].insert(dest);
mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR);
dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack)
// take away the popularity we're sending. FIXME: do this later?
mds->balancer->subtract_export(dir);
-
// freeze the subtree
dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest));
}
CDir *dir = in->dir;
assert(dir);
- int from = m->get_source().num();
- assert(export_gather[dir].count(from));
- export_gather[dir].erase(from);
+ dout(7) << "export_dir_discover_ack from " << m->get_source()
+ << " on " << *dir << ", releasing auth_pin" << endl;
- if (export_gather[dir].empty()) {
- dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
- dir->auth_unpin(); // unpin to allow freeze to complete
- } else {
- dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl;
- }
+ export_state[dir] = EXPORT_FREEZING;
+
+ dir->auth_unpin(); // unpin to allow freeze to complete
delete m; // done
}
+class C_MDC_ExportStartLogged : public Context {
+ Migrator *mig;
+ CDir *ex; // dir i'm exporting
+ int dest;
+ MExportDirPrep *prep;
+
+public:
+ C_MDC_ExportStartLogged(Migrator *m, CDir *e, int d, MExportDirPrep *p) :
+ mig(m), ex(e), dest(d), prep(p) {}
+ virtual void finish(int r) {
+ mig->export_dir_frozen_logged(ex, prep, dest);
+ }
+};
void Migrator::export_dir_frozen(CDir *dir,
int dest)
{
// subtree is now frozen!
dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl;
+ export_state[dir] = EXPORT_LOGGINGSTART;
show_imports();
+ EExportStart *le = new EExportStart(dir, dest);
MExportDirPrep *prep = new MExportDirPrep(dir->inode);
// include spanning tree for all nested exports.
// dir_auth updates on any nested exports are properly absorbed.
set<inodeno_t> inodes_added;
-
+
// include base dir
- prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) );
+ prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) );
+ le->metablob.add_dir( dir, false );
// also include traces to all nested exports.
set<CDir*> my_nested;
dout(7) << " including nested export " << *exp << " in prep" << endl;
prep->add_export( exp->ino() );
+ le->get_bounds().insert(exp->ino());
+ le->metablob.add_dir_context( exp );
+ le->metablob.add_dir( exp, false );
/* first assemble each trace, in trace order, and put in message */
list<CInode*> inode_trace;
// include dir? note: this'll include everything except the nested exports themselves,
// since someone else is obviously auth.
if (cur->is_auth()) {
- prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) ); // yay!
+ prep->add_dir( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay!
dout(7) << " added " << *cur << endl;
}
it++) {
CInode *in = *it;
dout(7) << " added " << *in << endl;
- prep->add_inode( in->parent->dir->ino(),
- in->parent->name,
+ prep->add_inode( in->parent->get_dir()->ino(),
+ in->parent->get_name(),
in->replicate_to(dest) );
}
}
- // send it!
+ // log our intentions
+ dout(7) << " logging EExportStart" << endl;
+ mds->mdlog->submit_entry(le, new C_MDC_ExportStartLogged(this, dir, dest, prep));
+}
+
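
The export path now journals its intent before acting: submit_entry() takes a log event plus a Context whose finish() runs once the entry is safe, and the next stage (here, sending the prep message) happens only in that callback. A self-contained sketch of the pattern, with toy Journal/Context classes rather than the real MDLog API:

    // sketch only: "journal it, then continue in a Context" pattern
    #include <iostream>
    #include <list>
    #include <string>
    #include <utility>

    struct Context {
      virtual ~Context() {}
      virtual void finish(int r) = 0;
    };

    class Journal {
      std::list<std::pair<std::string, Context*> > pending;
    public:
      void submit_entry(const std::string& event, Context* onsafe) {
        pending.push_back(std::make_pair(event, onsafe));
      }
      void flush() {                       // pretend the entries just hit disk
        while (!pending.empty()) {
          std::pair<std::string, Context*> e = pending.front();
          pending.pop_front();
          std::cout << "journaled: " << e.first << "\n";
          if (e.second) { e.second->finish(0); delete e.second; }
        }
      }
    };

    struct C_ExportStartLogged : public Context {
      std::string dir;
      C_ExportStartLogged(const std::string& d) : dir(d) {}
      void finish(int) { std::cout << "EExportStart safe, prepping " << dir << "\n"; }
    };

    int main() {
      Journal mdlog;
      mdlog.submit_entry("EExportStart(/usr)", new C_ExportStartLogged("/usr"));
      mdlog.flush();   // only now does the export move to the prepping stage
      return 0;
    }
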
+void Migrator::export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest)
+{
+ dout(7) << "export_dir_frozen_logged " << *dir << endl;
+
+ if (export_state.count(dir) == 0 ||
+ export_state[dir] != EXPORT_LOGGINGSTART) {
+ // export must have aborted.
+ dout(7) << "export must have aborted, unfreezing and deleting me old prep message" << endl;
+ delete prep;
+ dir->unfreeze_tree(); // cancel the freeze
+ return;
+ }
+
+ export_state[dir] = EXPORT_PREPPING;
mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR);
}
dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl;
+ if (export_state.count(dir) == 0 ||
+ export_state[dir] != EXPORT_PREPPING) {
+ // export must have aborted.
+ dout(7) << "export must have aborted, unfreezing" << endl;
+ dir->unfreeze_tree();
+ return;
+ }
+
// start export.
+ export_state[dir] = EXPORT_EXPORTING;
export_dir_go(dir, m->get_source().num());
// done
void Migrator::export_dir_go(CDir *dir,
- int dest)
+ int dest)
{
dout(7) << "export_dir_go " << *dir << " to " << dest << endl;
show_imports();
-
- // build export message
- MExportDir *req = new MExportDir(dir->inode); // include pop
-
+ assert(export_bounds.count(dir) == 0);
+ assert(export_data.count(dir) == 0);
// update imports/exports
CDir *containing_import = cache->get_auth_container(dir);
assert(dir->is_import());
cache->imports.erase(dir);
dir->state_clear(CDIR_STATE_IMPORT);
- dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import
+ dir->put(CDir::PIN_IMPORT); // unpin, no longer an import
  // discard nested exports (that we're handing off)
for (set<CDir*>::iterator p = cache->nested_exports[dir].begin();
p++;
// add to export message
- req->add_export(nested);
+ export_bounds[dir].insert(nested);
// nested beneath our new export *in; remove!
dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
cache->nested_exports[containing_import].insert(dir);
dir->state_set(CDIR_STATE_EXPORT);
- dir->get(CDIR_PIN_EXPORT); // i must keep it pinned
+ dir->get(CDir::PIN_EXPORT); // i must keep it pinned
// discard nested exports (that we're handing off)
for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
// exports.erase(nested); _walk does this
// add to msg
- req->add_export(nested);
+ export_bounds[dir].insert(nested);
} else {
dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
assert(cache->get_auth_container(containing_export) != containing_import);
else
dir->set_dir_auth( dest );
+
// make list of nodes i expect an export_dir_notify_ack from
// (everyone w/ this dir open, but me!)
assert(export_notify_ack_waiting[dir].empty());
- for (set<int>::iterator it = dir->open_by.begin();
- it != dir->open_by.end();
+ for (map<int,int>::iterator it = dir->replicas_begin();
+ it != dir->replicas_end();
it++) {
- if (*it == mds->get_nodeid()) continue;
- export_notify_ack_waiting[dir].insert( *it );
+ if (it->first == mds->get_nodeid()) continue;
+ export_notify_ack_waiting[dir].insert( it->first );
// send warning to all but dest
- if (*it != dest) {
- dout(10) << " sending export_dir_warning to mds" << *it << endl;
- mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR);
+ if (it->first != dest) {
+ dout(10) << " sending export_dir_warning to mds" << it->first << endl;
+ mds->send_message_mds(new MExportDirWarning( dir->ino() ), it->first, MDS_PORT_MIGRATOR);
}
}
assert(export_notify_ack_waiting[dir].count( dest ));
// fill export message with cache data
- C_Contexts *fin = new C_Contexts;
- int num_exported_inodes = export_dir_walk( req,
+ C_Contexts *fin = new C_Contexts; // collect all the waiters
+ int num_exported_inodes = encode_export_dir( export_data[dir],
fin,
dir, // base
dir, // recur start point
dest );
// send the export data!
+ MExportDir *req = new MExportDir(dir->ino());
+
+ // export state
+ req->set_dirstate( export_data[dir] );
+
+ // add bounds
+ for (set<CDir*>::iterator p = export_bounds[dir].begin();
+ p != export_bounds[dir].end();
+ ++p)
+ req->add_export((*p)->ino());
+
+  // send
mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR);
// queue up the finisher
*/
void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth)
{
- in->inode.version++; // so local log entries are ignored, etc. (FIXME ??)
-
// tell (all) clients about migrating caps.. mark STALE
for (map<int, Capability>::iterator it = in->client_caps.begin();
it != in->client_caps.end();
it->second.pending(),
it->second.wanted(),
MClientFileCaps::FILECAP_STALE);
- mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+ mds->messenger->send_message(m, mds->clientmap.get_inst(it->first),
0, MDS_PORT_CACHE);
}
// relax locks?
- if (!in->is_cached_by_anyone())
+ if (!in->is_replicated())
in->replicate_relax_locks();
// add inode
- assert(in->cached_by.count(mds->get_nodeid()) == 0);
+ assert(!in->is_replica(mds->get_nodeid()));
CInodeExport istate( in );
istate._encode( enc_state );
if (in->is_dirty()) in->mark_clean();
// clear/unpin cached_by (we're no longer the authority)
- in->cached_by_clear();
+ in->clear_replicas();
// twiddle lock states for auth -> replica transition
// hard
// *** other state too?
// move to end of LRU so we drop out of cache quickly!
- cache->lru.lru_bottouch(in);
+ if (in->get_parent_dn())
+ cache->lru.lru_bottouch(in->get_parent_dn());
}
-int Migrator::export_dir_walk(MExportDir *req,
- C_Contexts *fin,
- CDir *basedir,
- CDir *dir,
- int newauth)
+int Migrator::encode_export_dir(list<bufferlist>& dirstatelist,
+ C_Contexts *fin,
+ CDir *basedir,
+ CDir *dir,
+ int newauth)
{
int num_exported = 0;
dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl;
+ assert(dir->get_projected_version() == dir->get_version());
+
// dir
bufferlist enc_dir;
dstate._encode( enc_dir );
// release open_by
- dir->open_by_clear();
+ dir->clear_replicas();
// mark
assert(dir->is_auth());
// proxy
dir->state_set(CDIR_STATE_PROXY);
- dir->get(CDIR_PIN_PROXY);
+ dir->get(CDir::PIN_PROXY);
export_proxy_dirinos[basedir].push_back(dir->ino());
list<CDir*> subdirs;
CDir_map_t::iterator it;
for (it = dir->begin(); it != dir->end(); it++) {
CDentry *dn = it->second;
- CInode *in = dn->inode;
+ CInode *in = dn->get_inode();
num_exported++;
enc_dir.append("D", 1); // dirty
else
enc_dir.append("C", 1); // clean
-
+
+ version_t dnv = dn->get_version();
+ enc_dir.append((char*)&dnv, sizeof(dnv));
+
// null dentry?
if (dn->is_null()) {
enc_dir.append("N", 1); // null dentry
cache->exports.erase(in->dir); // discard nested export (nested_exports updated above)
in->dir->state_clear(CDIR_STATE_EXPORT);
- in->dir->put(CDIR_PIN_EXPORT);
+ in->dir->put(CDir::PIN_EXPORT);
// simplify dir_auth?
if (in->dir->get_dir_auth() == newauth)
// add to proxy
export_proxy_inos[basedir].push_back(in->ino());
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
+ in->state_set(CInode::STATE_PROXY);
+ in->get(CInode::PIN_PROXY);
// waiters
list<Context*> waiters;
}
}
- req->add_dir( enc_dir );
+ // add to dirstatelist
+ bufferlist bl;
+ dirstatelist.push_back( bl );
+ dirstatelist.back().claim( enc_dir );
// subdirs
for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
- num_exported += export_dir_walk(req, fin, basedir, *it, newauth);
+ num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth);
return num_exported;
}
+class C_MDS_ExportFinishLogged : public Context {
+ Migrator *migrator;
+ CDir *dir;
+public:
+ C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {}
+ void finish(int r) {
+ migrator->export_dir_finish(dir);
+ }
+};
+
+
/*
* i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack)
*/
assert(export_notify_ack_waiting[dir].count(from));
export_notify_ack_waiting[dir].erase(from);
+ dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
+ << ", still need (" << export_notify_ack_waiting[dir] << ")" << endl;
+
// done?
- if (!export_notify_ack_waiting[dir].empty()) {
- dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
- << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
-
+ if (export_notify_ack_waiting[dir].empty()) {
+ export_dir_acked(dir);
} else {
dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
- << ", last one!" << endl;
+ << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
+ }
+
+ delete m;
+}
- // ok, we're finished!
- export_notify_ack_waiting.erase(dir);
- // finish export (unfreeze, trigger finish context, etc.)
- export_dir_finish(dir);
- // unpin proxies
- // inodes
- for (list<inodeno_t>::iterator it = export_proxy_inos[dir].begin();
- it != export_proxy_inos[dir].end();
- it++) {
- CInode *in = cache->get_inode(*it);
- in->put(CINODE_PIN_PROXY);
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- }
- export_proxy_inos.erase(dir);
+/*
+ * this happens if the dest fails after i send the export data but before it is acked
+ * that is, we don't know they safely received and logged it, so we reverse our changes
+ * and go on.
+ */
+void Migrator::reverse_export(CDir *dir)
+{
+ dout(7) << "reverse_export " << *dir << endl;
+
+ assert(export_state[dir] == EXPORT_EXPORTING);
+ assert(export_bounds.count(dir));
+ assert(export_data.count(dir));
+
+ // re-import it.
+ set<CDir*> bounds;
+ bounds.swap(export_bounds[dir]);
+ export_bounds.erase(dir);
+
+ // -- adjust dir_auth --
+ // base
+ CDir *im = dir;
+ if (dir->get_inode()->authority() == mds->get_nodeid()) {
+ // parent is already me. was export, adding back to existing import.
+ im = mds->mdcache->get_auth_container(dir);
+ assert(im);
+ mds->mdcache->nested_exports[im].erase(dir);
+ mds->mdcache->exports.erase(dir);
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ dir->state_clear(CDIR_STATE_EXPORT);
+ dir->put(CDir::PIN_EXPORT);
+ } else {
+ // parent isn't me. new import.
+ mds->mdcache->imports.insert(dir);
+ dir->set_dir_auth( mds->get_nodeid() );
+ dir->state_set(CDIR_STATE_IMPORT);
+ dir->get(CDir::PIN_IMPORT);
+ }
- // dirs
- for (list<inodeno_t>::iterator it = export_proxy_dirinos[dir].begin();
- it != export_proxy_dirinos[dir].end();
- it++) {
- CDir *dir = cache->get_inode(*it)->dir;
- dir->put(CDIR_PIN_PROXY);
- assert(dir->state_test(CDIR_STATE_PROXY));
- dir->state_clear(CDIR_STATE_PROXY);
-
- // hose neg dentries, too, since we're no longer auth
- CDir_map_t::iterator it;
- for (it = dir->begin(); it != dir->end(); ) {
- CDentry *dn = it->second;
- it++;
- if (dn->is_null()) {
- assert(dn->is_sync());
- dir->remove_dentry(dn);
- } else {
- //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
- if (dn->is_dirty())
- dn->mark_clean();
- }
- }
+ dout(10) << " base " << *dir << endl;
+ if (dir != im)
+ dout(10) << " under " << *im << endl;
+
+ // bounds
+ for (set<CDir*>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CDir *bd = *p;
+
+ if (bd->get_dir_auth() == mds->get_nodeid()) {
+ // still me. was an import.
+ mds->mdcache->imports.erase(bd);
+ bd->set_dir_auth( CDIR_AUTH_PARENT );
+ bd->state_clear(CDIR_STATE_IMPORT);
+ bd->put(CDir::PIN_IMPORT);
+ // move nested exports.
+ for (set<CDir*>::iterator q = mds->mdcache->nested_exports[bd].begin();
+ q != mds->mdcache->nested_exports[bd].end();
+ ++q)
+ mds->mdcache->nested_exports[im].insert(*q);
+ mds->mdcache->nested_exports.erase(bd);
+ } else {
+ // not me anymore. now an export.
+ mds->mdcache->exports.insert(bd);
+ mds->mdcache->nested_exports[im].insert(bd);
+ assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
+ bd->set_dir_auth( CDIR_AUTH_UNKNOWN );
+ bd->state_set(CDIR_STATE_EXPORT);
+ bd->get(CDir::PIN_EXPORT);
}
- export_proxy_dirinos.erase(dir);
+
+ dout(10) << " bound " << *bd << endl;
+ }
+
+ // reimport the dirs
+ list<inodeno_t> imported_subdirs;
+ int num_imported_inodes = 0;
+
+ for (list<bufferlist>::iterator p = export_data[dir].begin();
+ p != export_data[dir].end();
+ ++p) {
+ num_imported_inodes +=
+ decode_import_dir(*p,
+ export_peer[dir],
+ dir, // import root
+ imported_subdirs,
+ 0);
}
- delete m;
+ // remove proxy bits
+ clear_export_proxy_pins(dir);
+
+ // some clean up
+ export_data.erase(dir);
+ export_bounds.erase(dir);
+ export_notify_ack_waiting.erase(dir);
}
+void Migrator::export_dir_acked(CDir *dir)
+{
+ dout(7) << "export_dir_acked " << *dir << endl;
+ export_notify_ack_waiting.erase(dir);
+
+ export_state[dir] = EXPORT_LOGGINGFINISH;
+ export_data.erase(dir);
+ export_bounds.erase(dir);
+
+ // log export completion, then finish (unfreeze, trigger finish context, etc.)
+ mds->mdlog->submit_entry(new EExportFinish(dir, true),
+ new C_MDS_ExportFinishLogged(this, dir));
+}
+
+
/*
 * once i get all the notify_acks i can finish
*/
void Migrator::export_dir_finish(CDir *dir)
{
- // exported!
+ dout(7) << "export_dir_finish " << *dir << endl;
-
- // FIXME log it
-
- // send finish to new auth
- mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
-
+ if (export_state.count(dir)) {
+ // send finish/commit to new auth
+ mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+
+ // remove from exporting list
+ export_state.erase(dir);
+ export_peer.erase(dir);
+ } else {
+ dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl;
+ }
+
// unfreeze
- dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl;
+ dout(7) << "export_dir_finish unfreezing" << endl;
dir->unfreeze_tree();
-
+
// unpin path
dout(7) << "export_dir_finish unpinning path" << endl;
vector<CDentry*> trace;
cache->make_trace(trace, dir->inode);
cache->path_unpin(trace, 0);
+ // unpin proxies
+ clear_export_proxy_pins(dir);
+
+ // queue finishers
+ mds->queue_finished(export_finish_waiters[dir]);
+ export_finish_waiters.erase(dir);
// stats
if (mds->logger) mds->logger->set("nex", cache->exports.size());
show_imports();
-}
-
+ // send pending import_maps?
+ mds->mdcache->send_pending_import_maps();
+}
+void Migrator::clear_export_proxy_pins(CDir *dir)
+{
+ dout(10) << "clear_export_proxy_pins " << *dir << endl;
+ // inodes
+ for (list<inodeno_t>::iterator it = export_proxy_inos[dir].begin();
+ it != export_proxy_inos[dir].end();
+ it++) {
+ CInode *in = cache->get_inode(*it);
+ dout(15) << " " << *in << endl;
+ in->put(CInode::PIN_PROXY);
+ assert(in->state_test(CInode::STATE_PROXY));
+ in->state_clear(CInode::STATE_PROXY);
+ }
+ export_proxy_inos.erase(dir);
+
+ // dirs
+ for (list<inodeno_t>::iterator it = export_proxy_dirinos[dir].begin();
+ it != export_proxy_dirinos[dir].end();
+ it++) {
+ CDir *dir = cache->get_inode(*it)->dir;
+ dout(15) << " " << *dir << endl;
+ dir->put(CDir::PIN_PROXY);
+ assert(dir->state_test(CDIR_STATE_PROXY));
+ dir->state_clear(CDIR_STATE_PROXY);
+
+ // hose neg dentries, too, since we're no longer auth
+ CDir_map_t::iterator it;
+ for (it = dir->begin(); it != dir->end(); ) {
+ CDentry *dn = it->second;
+ it++;
+ if (dn->is_null()) {
+ assert(dn->is_sync());
+ dir->remove_dentry(dn);
+ } else {
+ //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
+ if (dn->is_dirty())
+ dn->mark_clean();
+ }
+ }
+ }
+ export_proxy_dirinos.erase(dir);
+}
+// ==========================================================
+// IMPORT
-// IMPORTS
class C_MDC_ExportDirDiscover : public Context {
Migrator *mig;
}
// pin inode in the cache (for now)
- in->get(CINODE_PIN_IMPORTING);
+ in->get(CInode::PIN_IMPORTING);
// pin auth too, until the import completes.
in->auth_pin();
+
+ import_state[in->ino()] = IMPORT_DISCOVERED;
+ import_peer[in->ino()] = m->get_source().num();
+
// reply
dout(7) << " sending export_dir_discover_ack on " << *in << endl;
assert(!m->did_assim());
// open dir i'm importing.
- diri->set_dir( new CDir(diri, mds, false) );
+ diri->set_dir( new CDir(diri, mds->mdcache, false) );
dir = diri->dir;
m->get_dir(diri->ino())->update_dir(dir);
m->mark_assim(); // only do this the first time!
// move pin to dir
- diri->put(CINODE_PIN_IMPORTING);
- dir->get(CDIR_PIN_IMPORTING);
+ diri->put(CInode::PIN_IMPORTING);
+ dir->get(CDir::PIN_IMPORTING);
// auth pin too
dir->auth_pin();
diri->auth_unpin();
+
+ // change import state
+ import_state[diri->ino()] = IMPORT_PREPPING;
// assimilate traces to exports
for (list<CInodeDiscover*>::iterator it = m->get_inodes().begin();
m->get_dir(in->ino())->update_dir(in->dir);
dout(7) << " updated " << *in->dir << endl;
} else {
- in->set_dir( new CDir(in, mds, false) );
+ in->set_dir( new CDir(in, mds->mdcache, false) );
m->get_dir(in->ino())->update_dir(in->dir);
dout(7) << " added " << *in->dir << endl;
in->take_waiting(CINODE_WAIT_DIR, finished);
CInode *in = cache->get_inode(*it);
assert(in);
+ // note bound.
+ import_bounds[dir->ino()].insert(*it);
+
if (!in->dir) {
dout(7) << " opening nested export on " << *in << endl;
cache->open_remote_dir(in,
new C_MDS_RetryMessage(mds, m));
// pin it!
- in->get(CINODE_PIN_OPENINGDIR);
- in->state_set(CINODE_STATE_OPENINGDIR);
+ in->get(CInode::PIN_OPENINGDIR);
+ in->state_set(CInode::STATE_OPENINGDIR);
}
}
} else {
if (in->dir) {
if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
dout(7) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->get(CDir::PIN_IMPORTINGEXPORT);
in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
- if (in->state_test(CINODE_STATE_OPENINGDIR)) {
- in->put(CINODE_PIN_OPENINGDIR);
- in->state_clear(CINODE_STATE_OPENINGDIR);
+ if (in->state_test(CInode::STATE_OPENINGDIR)) {
+ in->put(CInode::PIN_OPENINGDIR);
+ in->state_clear(CInode::STATE_OPENINGDIR);
}
} else {
dout(7) << " already pinned nested export " << *in << endl;
dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl;
mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
m->get_source().num(), MDS_PORT_MIGRATOR);
-
+
+ // note new state
+ import_state[diri->ino()] = IMPORT_PREPPED;
+
// done
delete m;
}
};
*/
-
+class C_MDS_ImportDirLoggedStart : public Context {
+ Migrator *migrator;
+ CDir *dir;
+ int from;
+ list<inodeno_t> imported_subdirs;
+ list<inodeno_t> exports;
+public:
+ C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f,
+ list<inodeno_t>& is, list<inodeno_t>& e) :
+ migrator(m), dir(d), from(f) {
+ imported_subdirs.swap(is);
+ exports.swap(e);
+ }
+ void finish(int r) {
+ migrator->import_dir_logged_start(dir, from, imported_subdirs, exports);
+ }
+};
void Migrator::handle_export_dir(MExportDir *m)
{
assert(dir);
int oldauth = m->get_source().num();
- dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl;
+ dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl;
assert(dir->is_auth() == false);
-
-
show_imports();
+ // start the journal entry
+ EImportStart *le = new EImportStart(dir->ino(), m->get_exports());
+ le->metablob.add_dir_context(dir);
+
// note new authority (locally)
- if (dir->inode->is_auth())
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- dir->set_dir_auth( mds->get_nodeid() );
- dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl;
-
- // update imports/exports
- CDir *containing_import;
- if (cache->exports.count(dir)) {
- // reimporting
- dout(7) << " i'm reimporting " << *dir << endl;
- cache->exports.erase(dir);
-
+ CDir *im = dir;
+ if (dir->inode->is_auth()) {
+ // parent is already me. was export, adding back to existing import.
+ im = mds->mdcache->get_auth_container(dir);
+ assert(im);
+ mds->mdcache->nested_exports[im].erase(dir);
+ mds->mdcache->exports.erase(dir);
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
dir->state_clear(CDIR_STATE_EXPORT);
- dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export
-
- containing_import = cache->get_auth_container(dir);
- dout(7) << " it is nested under import " << *containing_import << endl;
- cache->nested_exports[containing_import].erase(dir);
+ dir->put(CDir::PIN_EXPORT);
} else {
- // new import
- cache->imports.insert(dir);
+ // parent isn't me. new import.
+ mds->mdcache->imports.insert(dir);
+ dir->set_dir_auth( mds->get_nodeid() );
dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDIR_PIN_IMPORT); // must keep it pinned
-
- containing_import = dir; // imported exports nested under *in
-
- dout(7) << " new import at " << *dir << endl;
+ dir->get(CDir::PIN_IMPORT);
}
-
// take out my temp pin
- dir->put(CDIR_PIN_IMPORTING);
+ dir->put(CDir::PIN_IMPORTING);
+
+ // mark import point frozen
+ // (note: this is a manual freeze.. hack hack hack!)
+ dir->get_inode()->auth_pin();
+ dir->state_set(CDIR_STATE_FROZENTREE);
- // add any inherited exports
+ dout(10) << " base " << *dir << endl;
+ if (dir != im)
+ dout(10) << " under " << *im << endl;
+
+ // bounds
for (list<inodeno_t>::iterator it = m->get_exports().begin();
it != m->get_exports().end();
it++) {
- CInode *exi = cache->get_inode(*it);
- assert(exi && exi->dir);
- CDir *ex = exi->dir;
+ CInode *bdi = cache->get_inode(*it);
+ CDir *bd = bdi->dir;
+
+ if (bd->get_dir_auth() == mds->get_nodeid()) {
+ // still me. was an import.
+ assert(bd->is_import());
+ mds->mdcache->imports.erase(bd);
+ bd->set_dir_auth( CDIR_AUTH_PARENT );
+ bd->state_clear(CDIR_STATE_IMPORT);
+ bd->put(CDir::PIN_IMPORT);
+ // move nested exports.
+ for (set<CDir*>::iterator q = mds->mdcache->nested_exports[bd].begin();
+ q != mds->mdcache->nested_exports[bd].end();
+ ++q)
+ mds->mdcache->nested_exports[im].insert(*q);
+ mds->mdcache->nested_exports.erase(bd);
+ } else {
+ // not me anymore. now an export.
+ mds->mdcache->exports.insert(bd);
+ mds->mdcache->nested_exports[im].insert(bd);
+ assert(bd->get_dir_auth() != CDIR_AUTH_PARENT);
+ bd->set_dir_auth( CDIR_AUTH_UNKNOWN );
+ bd->state_set(CDIR_STATE_EXPORT);
+ bd->get(CDir::PIN_EXPORT);
+ }
- dout(15) << " nested export " << *ex << endl;
+ // mark export point frozenleaf
+ bd->get(CDir::PIN_FREEZELEAF);
+ bd->state_set(CDIR_STATE_FROZENTREELEAF);
+ assert(import_bounds[dir->ino()].count(*it)); // we took note during prep stage
// remove our pin
- ex->put(CDIR_PIN_IMPORTINGEXPORT);
- ex->state_clear(CDIR_STATE_IMPORTINGEXPORT);
-
-
- // add...
- if (ex->is_import()) {
- dout(7) << " importing my import " << *ex << endl;
- cache->imports.erase(ex);
- ex->state_clear(CDIR_STATE_IMPORT);
-
- if (mds->logger) mds->logger->inc("imex");
+ bd->put(CDir::PIN_IMPORTINGEXPORT);
+ bd->state_clear(CDIR_STATE_IMPORTINGEXPORT);
- // move nested exports under containing_import
- for (set<CDir*>::iterator it = cache->nested_exports[ex].begin();
- it != cache->nested_exports[ex].end();
- it++) {
- dout(7) << " moving nested export " << **it << " under " << *containing_import << endl;
- cache->nested_exports[containing_import].insert(*it);
- }
- cache->nested_exports.erase(ex); // de-list under old import
-
- ex->set_dir_auth( CDIR_AUTH_PARENT );
- ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import
-
- } else {
- dout(7) << " importing export " << *ex << endl;
-
- // add it
- ex->state_set(CDIR_STATE_EXPORT);
- ex->get(CDIR_PIN_EXPORT); // all exports are pinned
- cache->exports.insert(ex);
- cache->nested_exports[containing_import].insert(ex);
- if (mds->logger) mds->logger->inc("imex");
- }
-
+ dout(10) << " bound " << *bd << endl;
}
-
-
+
// add this crap to my cache
list<inodeno_t> imported_subdirs;
- bufferlist dir_state;
- dir_state.claim( m->get_state() );
- int off = 0;
int num_imported_inodes = 0;
- for (int i = 0; i < m->get_ndirs(); i++) {
+ for (list<bufferlist>::iterator p = m->get_dirstate().begin();
+ p != m->get_dirstate().end();
+ ++p) {
num_imported_inodes +=
- import_dir_block(dir_state,
- off,
+ decode_import_dir(*p,
oldauth,
dir, // import root
- imported_subdirs);
+ imported_subdirs,
+ le);
}
dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
// adjust popularity
mds->balancer->add_import(dir);
- // send notify's etc.
- dout(7) << "sending notifyack for " << *dir << " to old auth " << m->get_source().num() << endl;
- mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
+ dout(7) << "handle_export_dir did " << *dir << endl;
- dout(7) << "sending notify to others" << endl;
- for (set<int>::iterator it = dir->open_by.begin();
- it != dir->open_by.end();
- it++) {
- assert( *it != mds->get_nodeid() );
- if ( *it == m->get_source().num() ) continue; // not to old auth.
-
- MExportDirNotify *notify = new MExportDirNotify(dir->ino(), m->get_source().num(), mds->get_nodeid());
- notify->copy_exports(m->get_exports());
-
- if (g_conf.mds_verify_export_dirauth)
- notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG)
-
- mds->send_message_mds(notify, *it, MDS_PORT_MIGRATOR);
- }
-
- // done
- delete m;
-
- show_imports();
-
-
- // is it empty?
- if (dir->get_size() == 0 &&
- !dir->inode->is_auth()) {
- // reexport!
- export_empty_import(dir);
- }
+ // log it
+ mds->mdlog->submit_entry(le,
+ new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num(),
+ imported_subdirs, m->get_exports()));
+ // note state
+ import_state[dir->ino()] = IMPORT_LOGGINGSTART;
// some stats
if (mds->logger) {
mds->logger->set("nim", cache->imports.size());
}
+ delete m;
+}
- // FIXME LOG IT
- /*
- stupid hashing crap, FIXME
+void Migrator::import_dir_logged_start(CDir *dir, int from,
+ list<inodeno_t> &imported_subdirs,
+ list<inodeno_t> &exports)
+{
+ dout(7) << "import_dir_logged " << *dir << endl;
- // wait for replicas in hashed dirs?
- if (import_hashed_replicate_waiting.count(m->get_ino())) {
- // it'll happen later!, when i get my inodegetreplicaack's back
- } else {
- // finish now
- //not anymoreimport_dir_finish(dir);
- }
- */
+ // note state
+ import_state[dir->ino()] = IMPORT_ACKING;
+
+ // send notify's etc.
+ dout(7) << "sending notifyack for " << *dir << " to old auth mds" << from << endl;
+ mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
+ from, MDS_PORT_MIGRATOR);
+
+ dout(7) << "sending notify to others" << endl;
+ for (map<int,int>::iterator it = dir->replicas_begin();
+ it != dir->replicas_end();
+ it++) {
+ assert( it->first != mds->get_nodeid() );
+ if ( it->first == from ) continue; // not to old auth.
+
+ MExportDirNotify *notify = new MExportDirNotify(dir->ino(), from, mds->get_nodeid());
+ notify->copy_exports(exports);
+ if (g_conf.mds_verify_export_dirauth)
+ notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG)
+
+ mds->send_message_mds(notify, it->first, MDS_PORT_MIGRATOR);
+ }
+
+ show_imports();
}
+class C_MDS_ImportDirLoggedFinish : public Context {
+ Migrator *migrator;
+ CDir *dir;
+public:
+ C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { }
+ void finish(int r) {
+ migrator->import_dir_logged_finish(dir);
+ }
+};
void Migrator::handle_export_dir_finish(MExportDirFinish *m)
{
CDir *dir = diri->dir;
assert(dir);
- dout(7) << "handle_export_dir_finish on " << *dir << endl;
+ dout(7) << "handle_export_dir_finish logging import_finish on " << *dir << endl;
assert(dir->is_auth());
- dout(5) << "done with import of " << *dir << endl;
- show_imports();
- if (mds->logger) {
- mds->logger->set("nex", cache->exports.size());
- mds->logger->set("nim", cache->imports.size());
- }
+ // note state
+ import_state[dir->ino()] = IMPORT_LOGGINGFINISH;
+
+ // log
+ mds->mdlog->submit_entry(new EImportFinish(dir, true),
+ new C_MDS_ImportDirLoggedFinish(this,dir));
+ delete m;
+}
+
+void Migrator::import_dir_logged_finish(CDir *dir)
+{
+ dout(7) << "import_dir_logged_finish " << *dir << endl;
// un auth pin (other exports can now proceed)
dir->auth_unpin();
+ // unfreeze!
+ for (set<inodeno_t>::iterator p = import_bounds[dir->ino()].begin();
+ p != import_bounds[dir->ino()].end();
+ ++p) {
+ CInode *diri = mds->mdcache->get_inode(*p);
+ CDir *dir = diri->dir;
+ assert(dir->state_test(CDIR_STATE_FROZENTREELEAF));
+ dir->put(CDir::PIN_FREEZELEAF);
+ dir->state_clear(CDIR_STATE_FROZENTREELEAF);
+ }
+
+ dir->unfreeze_tree();
+
+ // clear import state (we're done!)
+ import_state.erase(dir->ino());
+ import_peer.erase(dir->ino());
+ import_bounds.erase(dir->ino());
+
// ok now finish contexts
dout(5) << "finishing any waiters on imported data" << endl;
dir->finish_waiting(CDIR_WAIT_IMPORTED);
- delete m;
+ // log it
+ if (mds->logger) {
+ mds->logger->set("nex", cache->exports.size());
+ mds->logger->set("nim", cache->imports.size());
+ }
+ show_imports();
+
+ // is it empty?
+ if (dir->get_size() == 0 &&
+ !dir->inode->is_auth()) {
+ // reexport!
+ export_empty_import(dir);
+ }
}
in->set_auth(true);
}
- // link before state
+ // state after link -- or not! -sage
+ set<int> merged_client_caps;
+ istate.update_inode(in, merged_client_caps);
+
+ // link before state -- or not! -sage
if (dn->inode != in) {
assert(!dn->inode);
dn->dir->link_inode(dn, in);
}
-
- // state after link
- set<int> merged_client_caps;
- istate.update_inode(in, merged_client_caps);
-
// add inode?
if (added) {
}
- // cached_by
- assert(!in->is_cached_by(oldauth));
- in->cached_by_add( oldauth, CINODE_EXPORT_NONCE );
- if (in->is_cached_by(mds->get_nodeid()))
- in->cached_by_remove(mds->get_nodeid());
+ // adjust replica list
+ //assert(!in->is_replica(oldauth)); // not true on failed export
+ in->add_replica( oldauth, CINODE_EXPORT_NONCE );
+ if (in->is_replica(mds->get_nodeid()))
+ in->remove_replica(mds->get_nodeid());
// twiddle locks
// hard
MClientFileCaps::FILECAP_REAP);
caps->set_mds( oldauth ); // reap from whom?
mds->messenger->send_message(caps,
- MSG_ADDR_CLIENT(*it), mds->clientmap.get_inst(*it),
+ mds->clientmap.get_inst(*it),
0, MDS_PORT_CACHE);
}
     if (in->filelock.gather_set.empty())  // necessary but not sufficient...
mds->locker->inode_file_eval(in);
}
-
- // other
- if (in->is_dirty()) {
- dout(10) << "logging dirty import " << *in << endl;
- mds->mdlog->submit_entry(new EInodeUpdate(in));
- }
}
-int Migrator::import_dir_block(bufferlist& bl,
- int& off,
- int oldauth,
- CDir *import_root,
- list<inodeno_t>& imported_subdirs)
+int Migrator::decode_import_dir(bufferlist& bl,
+ int oldauth,
+ CDir *import_root,
+ list<inodeno_t>& imported_subdirs,
+ EImportStart *le)
{
+ int off = 0;
+
// set up dir
CDirExport dstate;
off = dstate._decode(bl, off);
-
+
CInode *diri = cache->get_inode(dstate.get_ino());
assert(diri);
- CDir *dir = diri->get_or_open_dir(mds);
+ CDir *dir = diri->get_or_open_dir(mds->mdcache);
assert(dir);
-
- dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl;
+
+ dout(7) << "decode_import_dir " << *dir << endl;
// add to list
if (dir != import_root)
// assimilate state
dstate.update_dir( dir );
- if (diri->is_auth())
- dir->set_dir_auth( CDIR_AUTH_PARENT ); // update_dir may hose dir_auth
// mark (may already be marked from get_or_open_dir() above)
if (!dir->is_auth())
dir->state_set(CDIR_STATE_AUTH);
- // open_by
- assert(!dir->is_open_by(oldauth));
- dir->open_by_add(oldauth);
- if (dir->is_open_by(mds->get_nodeid()))
- dir->open_by_remove(mds->get_nodeid());
+ // adjust replica list
+ //assert(!dir->is_replica(oldauth)); // not true on failed export
+ dir->add_replica(oldauth);
+ if (dir->is_replica(mds->get_nodeid()))
+ dir->remove_replica(mds->get_nodeid());
+
+ // add to journal entry
+ if (le)
+ le->metablob.add_dir(dir, true); // Hmm: false would be okay in some cases
+
+ int num_imported = 0;
if (dir->is_hashed()) {
// do nothing; dir is hashed
- return 0;
} else {
// take all waiters on this dir
// NOTE: a pass of imported data is guaranteed to get all of my waiters because
dout(15) << "doing contents" << endl;
// contents
- int num_imported = 0;
long nden = dstate.get_nden();
for (; nden>0; nden--) {
char dirty;
bl.copy(off, 1, &dirty);
off++;
+
+ version_t dnv;
+ bl.copy(off, sizeof(dnv), (char*)&dnv);
+ off += sizeof(dnv);
char icode;
bl.copy(off, 1, &icode);
CDentry *dn = dir->lookup(dname);
if (!dn)
dn = dir->add_dentry(dname); // null
+
+ // mark dentry dirty?
+ if (dirty == 'D')
+ dn->_mark_dirty();
- // mark dn dirty _after_ we link the inode (scroll down)
-
+ dn->set_version( dnv );
+ dn->set_projected_version( dnv );
+
if (icode == 'N') {
// null dentry
assert(dn->is_null());
// inode
decode_import_inode(dn, bl, off, oldauth);
}
-
- // mark dentry dirty? (only _after_ we link the inode!)
- if (dirty == 'D') dn->mark_dirty();
-
- }
- if (dir->is_dirty())
- mds->mdlog->submit_entry(new EDirUpdate(dir));
+ // add dentry to journal entry
+ if (le)
+ le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead?
+ }
- return num_imported;
}
+
+ dout(7) << "decode_import_dir done " << *dir << endl;
+ return num_imported;
}
// fix up subdir export?
if (dn->inode->dir) {
assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT));
- dn->inode->dir->put(CDIR_PIN_IMPORTINGEXPORT);
+ dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT);
dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT);
if (dn->inode->dir->is_auth()) {
dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl;
dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
cache->imports.erase(dn->inode->dir);
- dn->inode->dir->put(CDIR_PIN_IMPORT);
+ dn->inode->dir->put(CDir::PIN_IMPORT);
dn->inode->dir->state_clear(CDIR_STATE_IMPORT);
// move nested under hashdir
else {
// not mine. make it an export.
dout(7) << "making subdir into export " << *dn->inode->dir << endl;
- dn->inode->dir->get(CDIR_PIN_EXPORT);
+ dn->inode->dir->get(CDir::PIN_EXPORT);
dn->inode->dir->state_set(CDIR_STATE_EXPORT);
cache->exports.insert(dn->inode->dir);
cache->nested_exports[dir].insert(dn->inode->dir);
}
// mark dentry dirty? (only _after_ we link the inode!)
- dn->mark_dirty();
+ dn->_mark_dirty(); // fixme
}
}
- remember simple rule: dir auth follows inode, unless dir_auth is explicit.
- - export_dir_walk and import_dir_block take care with dir_auth: (for import/export)
+  - encode_export_dir and decode_import_dir take care with dir_auth: (for import/export)
- on export, -1 is changed to mds->get_nodeid()
- on import, nothing special, actually.
// ok, go
dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDIR_PIN_HASHING);
+ dir->get(CDir::PIN_HASHING);
assert(dir->hashed_subset.empty());
// discover on all mds
it != dir->end();
it++) {
CInode *in = it->second->inode;
- in->mark_dirty();
+ in->_mark_dirty(); // fixme
}
if (dir->is_frozen_dir())
if (!in->is_dir()) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode == mds->get_nodeid()) continue;
// msg?
CDentry *dn = it->second;
CInode *in = dn->inode;
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode == mds->get_nodeid()) {
continue; // still mine!
}
// add to proxy
hash_proxy_inos[dir].push_back(in);
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
+ in->state_set(CInode::STATE_PROXY);
+ in->get(CInode::PIN_PROXY);
// fix up subdirs
if (in->dir) {
dout(7) << "making subdir into import " << *in->dir << endl;
in->dir->set_dir_auth( mds->get_nodeid() );
cache->imports.insert(in->dir);
- in->dir->get(CDIR_PIN_IMPORT);
+ in->dir->get(CDir::PIN_IMPORT);
in->dir->state_set(CDIR_STATE_IMPORT);
// fix nested bits
// not mine.
dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
assert(in->dir->is_export());
- in->dir->put(CDIR_PIN_EXPORT);
+ in->dir->put(CDir::PIN_EXPORT);
in->dir->state_clear(CDIR_STATE_EXPORT);
cache->exports.erase(in->dir);
cache->nested_exports[containing_import].erase(in->dir);
// dir state
dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDIR_PIN_HASHED);
+ dir->get(CDir::PIN_HASHED);
cache->hashdirs.insert(dir);
- dir->mark_dirty();
- mds->mdlog->submit_entry(new EDirUpdate(dir));
+ dir->mark_dirty(dir->pre_dirty()); // fixme
+ mds->mdlog->submit_entry(new EString("dirty dir fixme"));
// inode state
if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+ dir->inode->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("hash dirty fixme"));
}
// fix up nested_exports?
// dir state
hash_gather.erase(dir);
dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDIR_PIN_HASHING);
+ dir->put(CDir::PIN_HASHING);
dir->hashed_subset.clear();
// unproxy inodes
it != hash_proxy_inos[dir].end();
it++) {
CInode *in = *it;
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- in->put(CINODE_PIN_PROXY);
+ assert(in->state_test(CInode::STATE_PROXY));
+ in->state_clear(CInode::STATE_PROXY);
+ in->put(CInode::PIN_PROXY);
}
hash_proxy_inos.erase(dir);
if (!in) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != from) continue; // we'll import these in a minute
if (in->dir->authority() != dentryhashcode)
hash_gather.erase(dir);
dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDIR_PIN_HASHING);
+ dir->put(CDir::PIN_HASHING);
dir->hashed_subset.clear();
} else {
dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
// pin dir, set hashing flag
dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDIR_PIN_HASHING);
+ dir->get(CDir::PIN_HASHING);
assert(dir->hashed_subset.empty());
// inode state
dir->inode->inode.hash_seed = 1;// dir->ino();
if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+ dir->inode->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("hash dirty fixme"));
}
// get gather set ready for notifies
if (in->dir) {
if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->get(CDir::PIN_IMPORTINGEXPORT);
in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
} else {
dout(5) << " already pinned nested export " << *in << endl;
// dir state
dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDIR_PIN_HASHED);
+ dir->get(CDir::PIN_HASHED);
cache->hashdirs.insert(dir);
dir->hashed_subset.insert(mds->get_nodeid());
// dir is complete
dir->mark_complete();
- dir->mark_dirty();
- mds->mdlog->submit_entry(new EDirUpdate(dir));
+ dir->mark_dirty(dir->pre_dirty()); // fixme
+ mds->mdlog->submit_entry(new EString("dirty dir fixme"));
// commit
mds->mdstore->commit_dir(dir, 0);
if (in->dir) {
if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->get(CDir::PIN_IMPORTINGEXPORT);
in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
} else {
dout(5) << " already pinned nested export " << *in << endl;
// dir state
//dir->state_clear(CDIR_STATE_UNHASHING); //later
dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDIR_PIN_HASHED);
+ dir->put(CDir::PIN_HASHED);
cache->hashdirs.erase(dir);
// commit!
assert(dir->is_complete());
//dir->mark_complete();
- dir->mark_dirty();
+ dir->mark_dirty(dir->pre_dirty()); // fixme
mds->mdstore->commit_dir(dir, 0);
// inode state
dir->inode->inode.hash_seed = 0;
if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+ dir->inode->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("hash inode dirty fixme"));
}
// notify
it++) {
CInode *in = it->second->inode;
if (in->is_auth()) {
- in->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(in));
+ in->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("unhash dirty fixme"));
}
}
if (!in->is_dir()) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != mds->get_nodeid()) continue;
// msg?
CDentry *dn = it->second;
CInode *in = dn->inode;
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != mds->get_nodeid()) {
// not mine!
// twiddle dir_auth?
dn->mark_clean();
// proxy
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
+ in->state_set(CInode::STATE_PROXY);
+ in->get(CInode::PIN_PROXY);
hash_proxy_inos[dir].push_back(in);
if (in->dir) {
dout(7) << "making subdir into import " << *in->dir << endl;
in->dir->set_dir_auth( mds->get_nodeid() );
cache->imports.insert(in->dir);
- in->dir->get(CDIR_PIN_IMPORT);
+ in->dir->get(CDir::PIN_IMPORT);
in->dir->state_set(CDIR_STATE_IMPORT);
}
else {
// not mine.
dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
assert(in->dir->is_export());
- in->dir->put(CDIR_PIN_EXPORT);
+ in->dir->put(CDir::PIN_EXPORT);
in->dir->state_clear(CDIR_STATE_EXPORT);
cache->exports.erase(in->dir);
cache->nested_exports[dir].erase(in->dir);
// dir state
//dir->state_clear(CDIR_STATE_UNHASHING); // later
dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDIR_PIN_HASHED);
+ dir->put(CDir::PIN_HASHED);
cache->hashdirs.erase(dir);
dir->mark_clean();
// inode state
dir->inode->inode.hash_seed = 0;
if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+ dir->inode->_mark_dirty(); // fixme
+ mds->mdlog->submit_entry(new EString("unhash inode dirty fixme"));
}
// init gather set
- hash_gather[dir] = mds->get_mds_map()->get_mds();
+ mds->get_mds_map()->get_active_mds_set( hash_gather[dir] );
hash_gather[dir].erase(mds->get_nodeid());
// send unhash message
it != hash_proxy_inos[dir].end();
it++) {
CInode *in = *it;
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- in->put(CINODE_PIN_PROXY);
+ assert(in->state_test(CInode::STATE_PROXY));
+ in->state_clear(CInode::STATE_PROXY);
+ in->put(CInode::PIN_PROXY);
}
// unfreeze
class CInode;
class CDentry;
+class MExportDir;
class MExportDirDiscover;
class MExportDirDiscoverAck;
class MExportDirPrep;
class MExportDirPrepAck;
class MExportDirWarning;
-class MExportDir;
class MExportDirNotify;
class MExportDirNotifyAck;
class MExportDirFinish;
class MUnhashDirNotify;
class MUnhashDirNotifyAck;
+class EImportStart;
+
class Migrator {
private:
MDS *mds;
MDCache *cache;
+ // -- exports --
+ // export stages. used to clean up intelligently if there's a failure.
+ const static int EXPORT_DISCOVERING = 1; // dest is discovering export dir
+ const static int EXPORT_FREEZING = 2; // we're freezing the dir tree
+ const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart
+ const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds
+ const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for acks
+ const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish
+
// export fun
- map<CDir*, set<int> > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
+ map<CDir*, int> export_state;
+ map<CDir*, int> export_peer;
+ map<CDir*, set<CDir*> > export_bounds;
+ map<CDir*, list<bufferlist> > export_data; // only during EXPORTING state
+ map<CDir*, set<int> > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
map<CDir*, list<inodeno_t> > export_proxy_inos;
map<CDir*, list<inodeno_t> > export_proxy_dirinos;
+ map<CDir*, list<Context*> > export_finish_waiters;
+
set<inodeno_t> stray_export_warnings; // notifies i haven't seen
map<inodeno_t, MExportDirNotify*> stray_export_notifies;
- // hashing madness
+
+ // -- imports --
+ const static int IMPORT_DISCOVERED = 1; // waiting for prep
+ const static int IMPORT_PREPPING = 2; // opening dirs on bounds
+ const static int IMPORT_PREPPED = 3; // opened bounds, waiting for import
+ const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart
+ const static int IMPORT_ACKING = 5; // logged, sent acks
+ const static int IMPORT_LOGGINGFINISH = 6;
+
+ map<inodeno_t,int> import_state;
+ map<inodeno_t,int> import_peer;
+ map<inodeno_t,set<inodeno_t> > import_bounds;
+
+
+ // -- hashing madness --
multimap<CDir*, int> unhash_waiting; // nodes i am waiting for UnhashDirAck's from
multimap<inodeno_t, inodeno_t> import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir
// maps frozen_dir_ino's to waiting-for-discover ino's.
multimap<inodeno_t, inodeno_t> import_hashed_frozen_waiting; // dirs i froze (for the above)
-
+
+
+
public:
// -- cons --
Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {}
void dispatch(Message*);
+
+ // -- status --
+ int is_exporting(CDir *dir) {
+ if (export_state.count(dir)) return export_state[dir];
+ return 0;
+ }
+ bool is_exporting() { return !export_state.empty(); }
+ int is_importing(inodeno_t dirino) {
+ if (import_state.count(dirino)) return import_state[dirino];
+ return 0;
+ }
+ bool is_importing() { return !import_state.empty(); }
+ const set<inodeno_t>& get_import_bounds(inodeno_t base) {
+ assert(import_bounds.count(base));
+ return import_bounds[base];
+ }
+
+
+ // -- misc --
+ void handle_mds_failure(int who);
+ void show_imports();
+
+
// -- import/export --
// exporter
public:
void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth);
void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth);
+ void add_export_finish_waiter(CDir *dir, Context *c) {
+ export_finish_waiters[dir].push_back(c);
+ }
+ void clear_export_proxy_pins(CDir *dir);
+
protected:
- map< CDir*, set<int> > export_gather;
void handle_export_dir_discover_ack(MExportDirDiscoverAck *m);
void export_dir_frozen(CDir *dir, int dest);
+ void export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest);
void handle_export_dir_prep_ack(MExportDirPrepAck *m);
void export_dir_go(CDir *dir,
int dest);
- int export_dir_walk(MExportDir *req,
+ int encode_export_dir(list<bufferlist>& dirstatelist,
class C_Contexts *fin,
CDir *basedir,
CDir *dir,
int newauth);
- void export_dir_finish(CDir *dir);
void handle_export_dir_notify_ack(MExportDirNotifyAck *m);
-
-
- friend class C_MDC_ExportFreeze;
+ void reverse_export(CDir *dir);
+ void export_dir_acked(CDir *dir);
+ void export_dir_finish(CDir *dir);
+ friend class C_MDC_ExportFreeze;
+ friend class C_MDC_ExportStartLogged;
+ friend class C_MDS_ExportFinishLogged;
// importer
void handle_export_dir_discover(MExportDirDiscover *m);
void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r);
void handle_export_dir_prep(MExportDirPrep *m);
void handle_export_dir(MExportDir *m);
- void import_dir_finish(CDir *dir);
+ void import_dir_logged_start(CDir *dir, int from,
+ list<inodeno_t> &imported_subdirs,
+ list<inodeno_t> &exports);
+ void import_dir_logged_finish(CDir *dir);
void handle_export_dir_finish(MExportDirFinish *m);
- int import_dir_block(bufferlist& bl,
- int& off,
- int oldauth,
- CDir *import_root,
- list<inodeno_t>& imported_subdirs);
+ int decode_import_dir(bufferlist& bl,
+ int oldauth,
+ CDir *import_root,
+ list<inodeno_t>& imported_subdirs,
+ EImportStart *le);
void got_hashed_replica(CDir *import,
inodeno_t dir_ino,
inodeno_t replica_ino);
-
friend class C_MDC_ExportDirDiscover;
+ friend class C_MDS_ImportDirLoggedStart;
+ friend class C_MDS_ImportDirLoggedFinish;
// bystander
void handle_export_dir_warning(MExportDirWarning *m);
void handle_export_dir_notify(MExportDirNotify *m);
- void show_imports();
// -- hashed directories --
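The EXPORT_* and IMPORT_* stages above, together with the new export_state/import_state maps, let the rest of the MDS query migration progress through is_exporting()/is_importing() instead of poking at dir flags. A minimal caller sketch (hypothetical; only is_exporting() and export_dir() are taken from this patch):

    // sketch: refuse to start a second migration on a dir that is
    // already at some EXPORT_* stage
    void maybe_export(Migrator *migrator, CDir *dir, int dest)
    {
      if (migrator->is_exporting(dir))
        return;                         // a migration is already in flight
      migrator->export_dir(dir, dest);  // otherwise proceed as usual
    }
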
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include "OSDMonitor.h"
-
-#include "osd/OSDMap.h"
-
-#include "msg/Message.h"
-#include "msg/Messenger.h"
-
-#include "messages/MPing.h"
-#include "messages/MPingAck.h"
-#include "messages/MOSDFailure.h"
-#include "messages/MOSDMap.h"
-#include "messages/MOSDGetMap.h"
-#include "messages/MOSDBoot.h"
-#include "messages/MOSDIn.h"
-#include "messages/MOSDOut.h"
-
-#include "common/Timer.h"
-#include "common/Clock.h"
-
-#include "config.h"
-#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
-#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
-
-
-class C_OM_PingTick : public Context {
-public:
- Messenger *msgr;
- C_OM_PingTick(Messenger *m) : msgr(m) {}
- void finish(int r) {
- msgr->send_message(new MPing, MSG_ADDR_MON(0));
- }
-};
-
-class C_OM_Faker : public Context {
-public:
- OSDMonitor *om;
- C_OM_Faker(OSDMonitor *m) {
- this->om = m;
- }
- void finish(int r) {
- om->fake_reorg();
- }
-};
-
-class C_OM_FakeOSDFailure : public Context {
- OSDMonitor *mon;
- int osd;
- bool down;
-public:
- C_OM_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {}
- void finish(int r) {
- mon->fake_osd_failure(osd,down);
- }
-};
-
-
-
-void OSDMonitor::fake_osdmap_update()
-{
- dout(1) << "fake_osdmap_update" << endl;
- accept_pending();
-
- // tell a random osd
- send_incremental_map(osdmap->get_epoch()-1, // ick! FIXME
- MSG_ADDR_OSD(rand() % g_conf.num_osd));
-}
-
-
-void OSDMonitor::fake_reorg()
-{
- int r = rand() % g_conf.num_osd;
-
- if (osdmap->is_out(r)) {
- dout(1) << "fake_reorg marking osd" << r << " in" << endl;
- pending.new_in.push_back(r);
- } else {
- dout(1) << "fake_reorg marking osd" << r << " out" << endl;
- pending.new_out.push_back(r);
- }
-
- accept_pending();
-
- // tell him!
- send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(r));
-}
-
-
-void OSDMonitor::init()
-{
- dout(1) << "init" << endl;
-
-
- // <HACK set up OSDMap from g_conf>
- osdmap = new OSDMap();
- osdmap->set_pg_bits(g_conf.osd_pg_bits);
-
- // start at epoch 0 until all osds boot
- //osdmap->inc_epoch(); // = 1
- //assert(osdmap->get_epoch() == 1);
-
-
- //if (g_conf.mkfs) osdmap->set_mkfs();
-
- Bucket *b = new UniformBucket(1, 0);
- int root = osdmap->crush.add_bucket(b);
- for (int i=0; i<g_conf.num_osd; i++) {
- osdmap->osds.insert(i);
- b->add_item(i, 1);
- }
-
- for (int i=1; i<5; i++) {
- osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
- osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
- osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
- }
-
- if (g_conf.mds_local_osd) {
- // add mds osds, but don't put them in the crush mapping func
- for (int i=0; i<g_conf.num_mds; i++)
- osdmap->osds.insert(i+10000);
- }
-
- // </HACK>
-
-
-
- if (whoami == 0 &&
- g_conf.num_osd > 4 &&
- g_conf.fake_osdmap_expand) {
- dout(1) << "scheduling OSD map reorg at " << g_conf.fake_osdmap_expand << endl;
- g_timer.add_event_after(g_conf.fake_osdmap_expand,
- new C_OM_Faker(this));
- }
-
- if (whoami == 0) {
- // fake osd failures
- for (map<int,float>::iterator i = g_fake_osd_down.begin();
- i != g_fake_osd_down.end();
- i++) {
- dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
- g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 1));
- }
- for (map<int,float>::iterator i = g_fake_osd_out.begin();
- i != g_fake_osd_out.end();
- i++) {
- dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
- g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 0));
- }
- }
-
-
- // i'm ready!
- messenger->set_dispatcher(this);
-
- // start ticker
- g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
-}
-
-
-void OSDMonitor::dispatch(Message *m)
-{
- switch (m->get_type()) {
- case MSG_OSD_FAILURE:
- handle_osd_failure((MOSDFailure*)m);
- break;
-
- case MSG_PING_ACK:
- handle_ping_ack((MPingAck*)m);
- break;
-
- case MSG_OSD_GETMAP:
- handle_osd_getmap((MOSDGetMap*)m);
- return;
-
- case MSG_OSD_BOOT:
- handle_osd_boot((MOSDBoot*)m);
- return;
-
- case MSG_OSD_IN:
- handle_osd_in((MOSDIn*)m);
- break;
- case MSG_OSD_OUT:
- handle_osd_out((MOSDOut*)m);
- break;
-
- case MSG_SHUTDOWN:
- handle_shutdown(m);
- return;
-
- case MSG_PING:
- tick();
- delete m;
- return;
-
- default:
- dout(0) << "unknown message " << *m << endl;
- assert(0);
- }
-}
-
-
-void OSDMonitor::handle_shutdown(Message *m)
-{
- dout(1) << "shutdown from " << m->get_source() << endl;
- messenger->shutdown();
- delete messenger;
- delete m;
-}
-
-void OSDMonitor::handle_ping_ack(MPingAck *m)
-{
- // ...
-
- delete m;
-}
-
-void OSDMonitor::handle_osd_failure(MOSDFailure *m)
-{
- dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl;
-
- // FIXME?
-
- // take their word for it
- int from = m->get_failed().num();
- if (osdmap->is_up(from) &&
- (osdmap->osd_inst.count(from) == 0 ||
- osdmap->osd_inst[from] == m->get_inst())) {
- pending.new_down[from] = m->get_inst();
-
- if (osdmap->is_in(from))
- pending_out[from] = g_clock.now();
-
- //awaiting_maps[pending.epoch][m->get_source()] =
-
- accept_pending();
- bcast_latest_osd_map_mds();
- //bcast_latest_osd_map_osd(); // FIXME: which osds can i tell?
- }
-
- send_incremental_map(m->get_epoch(), m->get_source());
-
- delete m;
-}
-
-
-
-void OSDMonitor::fake_osd_failure(int osd, bool down)
-{
- if (down) {
- dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
- pending.new_down[osd] = osdmap->osd_inst[osd];
- } else {
- dout(1) << "fake_osd_failure OUT osd" << osd << endl;
- pending.new_out.push_back(osd);
- }
- accept_pending();
- bcast_latest_osd_map_osd();
- bcast_latest_osd_map_mds();
-}
-
-
-void OSDMonitor::handle_osd_boot(MOSDBoot *m)
-{
- dout(7) << "osd_boot from " << m->get_source() << endl;
- assert(m->get_source().is_osd());
- int from = m->get_source().num();
-
- if (osdmap->get_epoch() == 0) {
- // waiting for boot!
- osdmap->osd_inst[from] = m->get_source_inst();
-
- if (osdmap->osd_inst.size() == osdmap->osds.size()) {
- dout(-7) << "osd_boot all osds booted." << endl;
- osdmap->inc_epoch();
- osdmap->encode(maps[osdmap->get_epoch()]); // 1
- pending.epoch = osdmap->get_epoch()+1; // 2
-
- send_map();
- bcast_latest_osd_map_osd();
- bcast_latest_osd_map_mds();
- } else {
- dout(7) << "osd_boot waiting for "
- << (osdmap->osds.size() - osdmap->osd_inst.size())
- << " osds to boot" << endl;
- }
- return;
- }
-
- // already up? mark down first?
- if (osdmap->is_up(from)) {
- assert(m->get_source_inst() > osdmap->osd_inst[from]); // this better be newer!
- pending.new_down[from] = osdmap->osd_inst[from];
- accept_pending();
- }
-
- // mark up.
- pending_out.erase(from);
- assert(osdmap->is_down(from));
- pending.new_up[from] = m->get_source_inst();
-
- // mark in?
- if (osdmap->out_osds.count(from))
- pending.new_in.push_back(from);
-
- accept_pending();
-
- // the booting osd will spread word
- send_incremental_map(m->sb.current_epoch, m->get_source());
- delete m;
-
- // tell mds
- bcast_latest_osd_map_mds();
-}
-
-void OSDMonitor::handle_osd_in(MOSDIn *m)
-{
- dout(7) << "osd_in from " << m->get_source() << endl;
- int from = m->get_source().num();
- if (osdmap->is_out(from)) {
- pending.new_in.push_back(from);
- accept_pending();
- send_incremental_map(m->map_epoch, m->get_source());
- }
-}
-
-void OSDMonitor::handle_osd_out(MOSDOut *m)
-{
- dout(7) << "osd_out from " << m->get_source() << endl;
- int from = m->get_source().num();
- if (osdmap->is_in(from)) {
- pending.new_out.push_back(from);
- accept_pending();
- send_incremental_map(m->map_epoch, m->get_source());
- }
-}
-
-
-void OSDMonitor::handle_osd_getmap(MOSDGetMap *m)
-{
- dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl;
-
- if (osdmap->get_epoch() == 0) {
- awaiting_map[1][m->get_source()] = m->get_since();
- } else {
- if (m->get_since())
- send_incremental_map(m->get_since(), m->get_source());
- else
- send_full_map(m->get_source());
- }
- delete m;
-}
-
-
-
-void OSDMonitor::accept_pending()
-{
- dout(-10) << "accept_pending " << osdmap->get_epoch() << " -> " << pending.epoch << endl;
-
- // accept pending into a new map!
- pending.encode( inc_maps[ pending.epoch ] );
-
- // advance!
- osdmap->apply_incremental(pending);
-
-
- // tell me about it
- for (map<int,entity_inst_t>::iterator i = pending.new_up.begin();
- i != pending.new_up.end();
- i++) {
- dout(0) << "osd" << i->first << " UP " << i->second << endl;
- derr(0) << "osd" << i->first << " UP " << i->second << endl;
- messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
- }
- for (map<int,entity_inst_t>::iterator i = pending.new_down.begin();
- i != pending.new_down.end();
- i++) {
- dout(0) << "osd" << i->first << " DOWN " << i->second << endl;
- derr(0) << "osd" << i->first << " DOWN " << i->second << endl;
- messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
- }
- for (list<int>::iterator i = pending.new_in.begin();
- i != pending.new_in.end();
- i++) {
- dout(0) << "osd" << *i << " IN" << endl;
- derr(0) << "osd" << *i << " IN" << endl;
- }
- for (list<int>::iterator i = pending.new_out.begin();
- i != pending.new_out.end();
- i++) {
- dout(0) << "osd" << *i << " OUT" << endl;
- derr(0) << "osd" << *i << " OUT" << endl;
- }
-
- // clear new pending
- OSDMap::Incremental next(osdmap->get_epoch() + 1);
- pending = next;
-}
-
-void OSDMonitor::send_map()
-{
- dout(10) << "send_map " << osdmap->get_epoch() << endl;
-
- map<msg_addr_t,epoch_t> s;
- s.swap( awaiting_map[osdmap->get_epoch()] );
- awaiting_map.erase(osdmap->get_epoch());
-
- for (map<msg_addr_t,epoch_t>::iterator i = s.begin();
- i != s.end();
- i++)
- send_incremental_map(i->second, i->first);
-}
-
-
-void OSDMonitor::send_full_map(msg_addr_t who)
-{
- messenger->send_message(new MOSDMap(osdmap), who);
-}
-
-void OSDMonitor::send_incremental_map(epoch_t since, msg_addr_t dest)
-{
- dout(-10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
- << " to " << dest << endl;
-
- MOSDMap *m = new MOSDMap;
-
- for (epoch_t e = osdmap->get_epoch();
- e > since;
- e--) {
- bufferlist bl;
- if (inc_maps.count(e)) {
- dout(-10) << "send_incremental_map inc " << e << endl;
- m->incremental_maps[e] = inc_maps[e];
- } else if (maps.count(e)) {
- dout(-10) << "send_incremental_map full " << e << endl;
- m->maps[e] = maps[e];
- //if (!full) break;
- }
- else {
- assert(0); // we should have all maps.
- }
- }
-
- messenger->send_message(m, dest);
-}
-
-
-
-void OSDMonitor::bcast_latest_osd_map_mds()
-{
- epoch_t e = osdmap->get_epoch();
- dout(1) << "bcast_latest_osd_map_mds epoch " << e << endl;
-
- // tell mds
- for (int i=0; i<g_conf.num_mds; i++) {
- //send_full_map(MSG_ADDR_MDS(i));
- send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_MDS(i));
- }
-}
-
-void OSDMonitor::bcast_latest_osd_map_osd()
-{
- epoch_t e = osdmap->get_epoch();
- dout(1) << "bcast_latest_osd_map_osd epoch " << e << endl;
-
- // tell osds
- set<int> osds;
- osdmap->get_all_osds(osds);
- for (set<int>::iterator it = osds.begin();
- it != osds.end();
- it++) {
- if (osdmap->is_down(*it)) continue;
-
- send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(*it));
- }
-}
-
-
-
-void OSDMonitor::tick()
-{
- dout(10) << "tick" << endl;
-
- // mark down osds out?
- utime_t now = g_clock.now();
- list<int> mark_out;
- for (map<int,utime_t>::iterator i = pending_out.begin();
- i != pending_out.end();
- i++) {
- utime_t down = now;
- down -= i->second;
-
- if (down.sec() >= g_conf.mon_osd_down_out_interval) {
- dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl;
- mark_out.push_back(i->first);
- }
- }
- for (list<int>::iterator i = mark_out.begin();
- i != mark_out.end();
- i++) {
- pending_out.erase(*i);
- pending.new_out.push_back( *i );
- accept_pending();
- }
-
- // next!
- g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __OSDMONITOR_H
-#define __OSDMONITOR_H
-
-#include <time.h>
-
-#include <map>
-#include <set>
-using namespace std;
-
-#include "include/types.h"
-#include "msg/Messenger.h"
-
-#include "osd/OSDMap.h"
-
-class OSDMonitor : public Dispatcher {
- // me
- int whoami;
- Messenger *messenger;
-
- // maps
- OSDMap *osdmap;
- map<epoch_t, bufferlist> maps;
- map<epoch_t, bufferlist> inc_maps;
-
- OSDMap::Incremental pending;
-
- map<epoch_t, map<msg_addr_t, epoch_t> > awaiting_map;
-
- // osd down -> out
- map<int,utime_t> pending_out;
-
-
- void tick(); // check state, take actions
-
- // maps
- void accept_pending(); // accept pending, new map.
- void send_map(); // send current map to waiters.
- void send_full_map(msg_addr_t dest);
- void send_incremental_map(epoch_t since, msg_addr_t dest);
- void bcast_latest_osd_map_mds();
- void bcast_latest_osd_map_osd();
-
-
- public:
- OSDMonitor(int w, Messenger *m) :
- whoami(w),
- messenger(m),
- osdmap(0) {
- }
-
- void init();
-
- void dispatch(Message *m);
- void handle_shutdown(Message *m);
-
- void handle_osd_boot(class MOSDBoot *m);
- void handle_osd_in(class MOSDIn *m);
- void handle_osd_out(class MOSDOut *m);
- void handle_osd_failure(class MOSDFailure *m);
- void handle_osd_getmap(class MOSDGetMap *m);
-
- void handle_ping_ack(class MPingAck *m);
-
- // hack
- void fake_osd_failure(int osd, bool down);
- void fake_osdmap_update();
- void fake_reorg();
-
-};
-
-#endif
#include "msg/Message.h"
#include "msg/Messenger.h"
-#include "events/EInodeUpdate.h"
-#include "events/EDirUpdate.h"
+#include "events/EString.h"
#include "events/EUnlink.h"
#include "messages/MRenameWarning.h"
// not import anymore!
cache->imports.erase(in->dir);
in->dir->state_clear(CDIR_STATE_IMPORT);
- in->dir->put(CDIR_PIN_IMPORT);
+ in->dir->put(CDir::PIN_IMPORT);
in->dir->set_dir_auth( CDIR_AUTH_PARENT );
dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
// i am now an import
cache->imports.insert(in->dir);
in->dir->state_set(CDIR_STATE_IMPORT);
- in->dir->get(CDIR_PIN_IMPORT);
+ in->dir->get(CDir::PIN_IMPORT);
in->dir->set_dir_auth( mds->get_nodeid() );
dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
// now export
cache->exports.insert(in->dir);
in->dir->state_set(CDIR_STATE_EXPORT);
- in->dir->get(CDIR_PIN_EXPORT);
+ in->dir->get(CDir::PIN_EXPORT);
assert(dir_auth >= 0); // better be defined
in->dir->set_dir_auth( dir_auth );
// remove from export list
cache->exports.erase(in->dir);
in->dir->state_clear(CDIR_STATE_EXPORT);
- in->dir->put(CDIR_PIN_EXPORT);
+ in->dir->put(CDir::PIN_EXPORT);
CDir *oldcon = cache->get_auth_container(srcdir);
assert(oldcon);
fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change
// mark dentries dirty
- srcdn->mark_dirty();
- destdn->mark_dirty();
- in->mark_dirty();
+ srcdn->_mark_dirty(); // fixme
+ destdn->_mark_dirty(); // fixme
+ in->_mark_dirty(); // fixme
// local, restrict notify to ppl with open dirs
- set<int> notify = srcdir->get_open_by();
- for (set<int>::iterator it = destdir->open_by_begin();
- it != destdir->open_by_end();
+ set<int> notify;
+ for (map<int,int>::iterator it = srcdir->replicas_begin();
+ it != srcdir->replicas_end();
+ ++it)
+ notify.insert(it->first);
+ for (map<int,int>::iterator it = destdir->replicas_begin();
+ it != destdir->replicas_end();
it++)
- if (notify.count(*it) == 0) notify.insert(*it);
+ if (notify.count(it->first) == 0) notify.insert(it->first);
if (notify.size()) {
// warn + notify
if (in->is_dir() && in->dir)
fix_renamed_dir(srcdir, in, destdir, true); // auth changed
- srcdn->mark_dirty();
+ srcdn->_mark_dirty(); // fixme
// proxy!
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
+ in->state_set(CInode::STATE_PROXY);
+ in->get(CInode::PIN_PROXY);
// generate notify list (everybody but src|dst) and send warnings
set<int> notify;
// we got all our MNotifyAck's.
// was i proxy (if not, it's cuz this was a local rename)
- if (in->state_test(CINODE_STATE_PROXY)) {
+ if (in->state_test(CInode::STATE_PROXY)) {
dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl;
- in->state_clear(CINODE_STATE_PROXY);
- in->put(CINODE_PIN_PROXY);
+ in->state_clear(CInode::STATE_PROXY);
+ in->put(CInode::PIN_PROXY);
}
// done!
}
// pin
- srcin->get(CINODE_PIN_RENAMESRC);
+ srcin->get(CInode::PIN_RENAMESRC);
// send rename request
MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator
}
// mark dirty
- destdn->mark_dirty();
- in->mark_dirty();
+ destdn->_mark_dirty(); // fixme
+ in->_mark_dirty(); // fixme
// unpin
- in->put(CINODE_PIN_RENAMESRC);
+ in->put(CInode::PIN_RENAMESRC);
// ok, send notifies.
set<int> notify;
#include "Migrator.h"
#include "MDBalancer.h"
#include "Renamer.h"
+#include "MDStore.h"
#include "msg/Messenger.h"
#include "messages/MInodeLink.h"
-#include "events/EInodeUpdate.h"
-#include "events/EDirUpdate.h"
-#include "events/EMknod.h"
-#include "events/EMkdir.h"
+#include "events/EString.h"
+#include "events/EUpdate.h"
#include "include/filepath.h"
#include "common/Timer.h"
// ack
messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap),
- m->get_source(), m->get_source_inst());
+ m->get_source_inst());
delete m;
}
mds->clientmap.rem_mount(n);
- if (mds->clientmap.get_mount_set().empty()) {
+ if (g_conf.mds_shutdown_on_last_unmount &&
+ mds->clientmap.get_mount_set().empty()) {
dout(3) << "all clients done, initiating shutdown" << endl;
mds->shutdown_start();
}
// ack by sending back to client
- entity_inst_t srcinst = m->get_source_inst(); // make a copy!
- messenger->send_message(m, m->get_source(), srcinst);
+ messenger->send_message(m, m->get_source_inst());
}
// send reply
messenger->send_message(reply,
- MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+ req->get_client_inst());
// discard request
mdcache->request_finish(req);
}
+void Server::submit_update(MClientRequest *req,
+ CInode *wrlockedi,
+ LogEvent *event,
+ Context *oncommit)
+{
+ // log
+ mdlog->submit_entry(event);
+
+ // pin
+ mdcache->request_pin_inode(req, wrlockedi);
+
+ // wait
+ mdlog->wait_for_sync(oncommit);
+}
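
No caller of submit_update appears in this hunk; presumably it is meant to be used along these lines (sketch only; C_MDS_someop_finish is a hypothetical finisher in the style of the ones below):

    submit_update(req, cur,
                  new EUpdate("someop"),                    // event to journal
                  new C_MDS_someop_finish(mds, req, cur));  // runs once the log syncs
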
+
+
/*
* commit event(s) to the metadata journal, then reply.
* or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply)
+ *
+ * NOTE: this is old and bad (write-behind!)
*/
void Server::commit_request(MClientRequest *req,
MClientReply *reply,
// send error
messenger->send_message(new MClientReply(req, r),
- MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+ req->get_client_inst());
// <HACK>
// is this a special debug command?
}
+// FIXME: this probably should go somewhere else.
+
+bool Server::try_open_dir(CInode *in, MClientRequest *req)
+{
+ if (!in->dir && in->is_frozen_dir()) {
+ // doh!
+ dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl;
+ assert(in->get_parent_dir());
+ in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, in));
+ return false;
+ }
+
+ in->get_or_open_dir(mds->mdcache);
+ return true;
+}
+
+
+
+// ===============================================================================
// STAT
void Server::handle_client_stat(MClientRequest *req,
CInode *ref)
{
+ // FIXME: this is really not the way to handle the statlite mask.
+
// do I need file info?
int mask = req->get_iarg();
if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) {
mds->balancer->hit_inode(ref, META_POP_IRD);
// reply
- dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
+ //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
MClientReply *reply = new MClientReply(req);
-
reply_request(req, reply, ref);
}
+
+// ===============================================================================
// INODE UPDATES
+
+/*
+ * finisher: do an inode_file_write_finish and reply.
+ */
+class C_MDS_utime_finish : public Context {
+ MDS *mds;
+ MClientRequest *req;
+ CInode *in;
+ version_t pv;
+ time_t mtime, atime;
+public:
+ C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) :
+ mds(m), req(r), in(i),
+ pv(pdv),
+ mtime(mt), atime(at) { }
+ void finish(int r) {
+ assert(r == 0);
+
+ // apply
+ in->inode.mtime = mtime;
+ in->inode.atime = atime;
+ in->mark_dirty(pv);
+
+ // unlock
+ mds->locker->inode_file_write_finish(in);
+
+ // reply
+ MClientReply *reply = new MClientReply(req, 0);
+ reply->set_result(0);
+ mds->server->reply_request(req, reply, in);
+ }
+};
+
+
// utime
void Server::handle_client_utime(MClientRequest *req,
- CInode *cur)
+ CInode *cur)
{
// write
if (!mds->locker->inode_file_write_start(cur, req))
return; // fw or (wait for) sync
- // do update
- cur->inode.mtime = req->get_targ();
- cur->inode.atime = req->get_targ2();
- if (cur->is_auth())
- cur->mark_dirty();
-
- mds->locker->inode_file_write_finish(cur);
-
mds->balancer->hit_inode(cur, META_POP_IWR);
- // init reply
- MClientReply *reply = new MClientReply(req, 0);
- reply->set_result(0);
-
- // commit
- commit_request(req, reply, cur,
- new EInodeUpdate(cur));
+ // prepare
+ version_t pdv = cur->pre_dirty();
+ time_t mtime = req->get_targ();
+ time_t atime = req->get_targ2();
+ C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv,
+ mtime, atime);
+
+ // log + wait
+ EUpdate *le = new EUpdate("utime");
+ le->metablob.add_dir_context(cur->get_parent_dir());
+ inode_t *pi = le->metablob.add_dentry(cur->parent, true);
+ pi->mtime = mtime;
+ pi->atime = atime;
+ pi->version = pdv;
+
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
}
-
-// HARD
+// --------------
+
+/*
+ * finisher: do an inode_hard_write_finish and reply.
+ */
+class C_MDS_chmod_finish : public Context {
+ MDS *mds;
+ MClientRequest *req;
+ CInode *in;
+ version_t pv;
+ int mode;
+public:
+ C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) :
+ mds(m), req(r), in(i), pv(pdv), mode(mo) { }
+ void finish(int r) {
+ assert(r == 0);
+
+ // apply
+ in->inode.mode &= ~04777;
+ in->inode.mode |= (mode & 04777);
+ in->mark_dirty(pv);
+
+ // unlock
+ mds->locker->inode_hard_write_finish(in);
+
+ // reply
+ MClientReply *reply = new MClientReply(req, 0);
+ reply->set_result(0);
+ mds->server->reply_request(req, reply, in);
+ }
+};
+
// chmod
void Server::handle_client_chmod(MClientRequest *req,
- CInode *cur)
+ CInode *cur)
{
// write
if (!mds->locker->inode_hard_write_start(cur, req))
return; // fw or (wait for) lock
-
- // check permissions
-
- // do update
+ mds->balancer->hit_inode(cur, META_POP_IWR);
+
+ // prepare
+ version_t pdv = cur->pre_dirty();
int mode = req->get_iarg();
- cur->inode.mode &= ~04777;
- cur->inode.mode |= (mode & 04777);
- cur->mark_dirty();
+ C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv,
+ mode);
+
+ // log + wait
+ EUpdate *le = new EUpdate("chmod");
+ le->metablob.add_dir_context(cur->get_parent_dir());
+ inode_t *pi = le->metablob.add_dentry(cur->parent, true);
+ pi->mode = mode;
+ pi->version = pdv;
+
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
+}
- mds->locker->inode_hard_write_finish(cur);
- mds->balancer->hit_inode(cur, META_POP_IWR);
+// chown
- // start reply
- MClientReply *reply = new MClientReply(req, 0);
+class C_MDS_chown_finish : public Context {
+ MDS *mds;
+ MClientRequest *req;
+ CInode *in;
+ version_t pv;
+ int uid, gid;
+public:
+ C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) :
+ mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { }
+ void finish(int r) {
+ assert(r == 0);
- // commit
- commit_request(req, reply, cur,
- new EInodeUpdate(cur));
-}
+ // apply
+ if (uid >= 0) in->inode.uid = uid;
+ if (gid >= 0) in->inode.gid = gid;
+ in->mark_dirty(pv);
+
+ // unlock
+ mds->locker->inode_hard_write_finish(in);
+
+ // reply
+ MClientReply *reply = new MClientReply(req, 0);
+ reply->set_result(0);
+ mds->server->reply_request(req, reply, in);
+ }
+};
-// chown
void Server::handle_client_chown(MClientRequest *req,
- CInode *cur)
+ CInode *cur)
{
// write
if (!mds->locker->inode_hard_write_start(cur, req))
return; // fw or (wait for) lock
- // check permissions
+ mds->balancer->hit_inode(cur, META_POP_IWR);
- // do update
+ // prepare
+ version_t pdv = cur->pre_dirty();
int uid = req->get_iarg();
int gid = req->get_iarg2();
- cur->inode.uid = uid;
- cur->inode.gid = gid;
- cur->mark_dirty();
-
- mds->locker->inode_hard_write_finish(cur);
-
- mds->balancer->hit_inode(cur, META_POP_IWR);
-
- // start reply
- MClientReply *reply = new MClientReply(req, 0);
-
- // commit
- commit_request(req, reply, cur,
- new EInodeUpdate(cur));
+ C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv,
+ uid, gid);
+
+ // log + wait
+ EUpdate *le = new EUpdate("chown");
+ le->metablob.add_dir_context(cur->get_parent_dir());
+ inode_t *pi = le->metablob.add_dentry(cur->parent, true);
+ if (uid >= 0) pi->uid = uid;
+ if (gid >= 0) pi->gid = gid;
+ pi->version = pdv;
+
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
}
-bool Server::try_open_dir(CInode *in, MClientRequest *req)
-{
- if (!in->dir && in->is_frozen_dir()) {
- // doh!
- dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl;
- assert(in->get_parent_dir());
- in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
- new C_MDS_RetryRequest(mds, req, in));
- return false;
- }
- in->get_or_open_dir(mds);
- return true;
-}
+
+// =================================================================
// DIRECTORY and NAMESPACE OPS
// READDIR
int Server::encode_dir_contents(CDir *dir,
- list<InodeStat*>& inls,
- list<string>& dnls)
+ list<InodeStat*>& inls,
+ list<string>& dnls)
{
int numfiles = 0;
// hashed?
if (dir->is_hashed() &&
- mds->get_nodeid() != mds->hash_dentry( dir->ino(), it->first ))
+ mds->get_nodeid() != mds->mdcache->hash_dentry( dir->ino(), it->first ))
continue;
-
- // is dentry readable?
- if (dn->is_xlocked()) {
- // ***** FIXME *****
- // ?
- dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl;
- }
-
+
+ if (dn->is_null()) continue;
+
CInode *in = dn->inode;
- if (!in) continue; // null dentry?
+ if (!in)
+ continue; // hmm, fixme! what about REMOTE links?
dout(12) << "including inode " << *in << endl;
// sent it back!
messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num),
- m->get_source(), m->get_source_inst(), MDS_PORT_CACHE);
+ m->get_source_inst(), MDS_PORT_CACHE);
}
}
+
+// ------------------------------------------------
+
// MKNOD
-void Server::handle_client_mknod(MClientRequest *req, CInode *ref)
+class C_MDS_mknod_finish : public Context {
+ MDS *mds;
+ MClientRequest *req;
+ CDentry *dn;
+ CInode *newi;
+ version_t pv;
+public:
+ C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) :
+ mds(m), req(r), dn(d), newi(ni),
+ pv(d->get_projected_version()) {}
+ void finish(int r) {
+ assert(r == 0);
+
+ // link the inode
+ dn->get_dir()->link_inode(dn, newi);
+
+ // dirty inode, dn, dir
+ newi->mark_dirty(pv);
+
+ // unlock
+ mds->locker->dentry_xlock_finish(dn);
+
+ // hit pop
+ mds->balancer->hit_inode(newi, META_POP_IWR);
+
+ // reply
+ MClientReply *reply = new MClientReply(req, 0);
+ reply->set_result(0);
+ mds->server->reply_request(req, reply, newi);
+ }
+};
+
+void Server::handle_client_mknod(MClientRequest *req, CInode *diri)
{
- // make dentry and inode, link.
- CInode *newi = mknod(req, ref);
- if (!newi) return;
+ CInode *newi = 0;
+ CDentry *dn = 0;
- // it's a file!
+ // make dentry and inode, xlock dentry.
+ if (!prepare_mknod(req, diri, &newi, &dn))
+ return;
+ assert(newi);
+ assert(dn);
+
+ // it's a file.
newi->inode.mode = req->get_iarg();
newi->inode.mode &= ~INODE_TYPE_MASK;
newi->inode.mode |= INODE_MODE_FILE;
- mds->balancer->hit_inode(newi, META_POP_IWR);
-
- // commit
- commit_request(req, new MClientReply(req, 0), ref,
- new EMknod(newi));
+ // prepare finisher
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi);
+ EUpdate *le = new EUpdate("mknod");
+ le->metablob.add_dir_context(diri->dir);
+ inode_t *pi = le->metablob.add_dentry(dn, true, newi);
+ pi->version = dn->get_projected_version();
+
+ // log + wait
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
}
-// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical.
-
-CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist)
-{
- dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl;
- // get containing directory (without last bit)
- filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1);
- string name = req->get_filepath().last_bit();
-
- // did we get to parent?
- dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl;
+/*
+ * verify that the dir exists and would own the dname.
+ * do not check if the dentry exists.
+ */
+CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& name)
+{
// make sure parent is a dir?
if (!diri->is_dir()) {
- dout(7) << "not a dir" << endl;
+ dout(7) << "validate_new_dentry_dir: not a dir" << endl;
reply_request(req, -ENOTDIR);
- return 0;
+ return false;
}
// am i not open, not auth?
if (!diri->dir && !diri->is_auth()) {
int dirauth = diri->authority();
- dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+ dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl;
mdcache->request_forward(req, dirauth);
- return 0;
+ return false;
}
- if (!try_open_dir(diri, req)) return 0;
+ if (!try_open_dir(diri, req))
+ return false;
CDir *dir = diri->dir;
// make sure it's my dentry
int dnauth = dir->dentry_authority(name);
if (dnauth != mds->get_nodeid()) {
// fw
-
- dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl;
+ dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir
+ << " dn " << name
+ << " not mine, fw to " << dnauth << endl;
mdcache->request_forward(req, dnauth);
- return 0;
+ return false;
}
- // ok, done passing buck.
+ // dir auth pinnable?
+ if (!dir->can_auth_pin()) {
+ dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl;
+ dir->add_waiter(CDIR_WAIT_AUTHPINNABLE,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return false;
+ }
// frozen?
if (dir->is_frozen()) {
dout(7) << "dir is frozen " << *dir << endl;
dir->add_waiter(CDIR_WAIT_UNFREEZE,
new C_MDS_RetryRequest(mds, req, diri));
- return 0;
+ return false;
}
+ return dir;
+}
+
+/*
+ * prepare a mknod-type operation (mknod, mkdir, symlink, open+create).
+ * create the inode and dentry, but do not link them.
+ * pre_dirty the dentry+dir.
+ * xlock the dentry.
+ *
+ * return val
+ * 0 - wait for something
+ * 1 - created
+ * 2 - already exists (only if okexist=true)
+ */
+int Server::prepare_mknod(MClientRequest *req, CInode *diri,
+ CInode **pin, CDentry **pdn,
+ bool okexist)
+{
+ dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl;
+
+ // get containing directory (without last bit)
+ filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1);
+ string name = req->get_filepath().last_bit();
+
+ CDir *dir = validate_new_dentry_dir(req, diri, name);
+ if (!dir) return 0;
+
// make sure name doesn't already exist
- CDentry *dn = dir->lookup(name);
- if (dn) {
- if (!dn->can_read(req)) {
- dout(10) << "waiting on (existing!) dentry " << *dn << endl;
+ *pdn = dir->lookup(name);
+ if (*pdn) {
+ if (!(*pdn)->can_read(req)) {
+ dout(10) << "waiting on (existing!) dentry " << **pdn << endl;
dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri));
return 0;
}
- if (!dn->is_null()) {
+ if (!(*pdn)->is_null()) {
// name already exists
if (okexist) {
dout(10) << "dentry " << name << " exists in " << *dir << endl;
- return dn->inode;
+ *pin = (*pdn)->inode;
+ return 2;
} else {
dout(10) << "dentry " << name << " exists in " << *dir << endl;
reply_request(req, -EEXIST);
return 0;
}
- // create!
- CInode *newi = mdcache->create_inode();
- newi->inode.uid = req->get_caller_uid();
- newi->inode.gid = req->get_caller_gid();
- newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime(); // now
+ // make sure dir is pinnable
+
+
+ // create inode
+ *pin = mdcache->create_inode();
+ (*pin)->inode.uid = req->get_caller_uid();
+ (*pin)->inode.gid = req->get_caller_gid();
+ (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now
+ // note: inode.version will get set by finisher's mark_dirty.
- // link
- if (!dn)
- dn = dir->add_dentry(name, newi);
- else
- dir->link_inode(dn, newi);
+ // create dentry
+ if (!*pdn)
+ *pdn = dir->add_dentry(name, 0);
+
+ (*pdn)->pre_dirty();
+
+ // xlock dentry
+ bool res = mds->locker->dentry_xlock_start(*pdn, req, diri);
+ assert(res == true);
// bump modify pop
mds->balancer->hit_dir(dir, META_POP_DWR);
+
+ return 1;
+}
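
Callers distinguish the three return codes roughly like this (sketch; handle_client_openc below is the real okexist=true consumer):

    CInode *in = 0;
    CDentry *dn = 0;
    int r = prepare_mknod(req, diri, &in, &dn, true /* okexist */);
    if (r == 0)
      return;      // forwarded, or waiting on a lock/freeze; request will be retried
    if (r == 2) {
      // name already existed; 'in' points at the existing inode
    } else {
      // r == 1: new inode and xlocked dentry, ready to journal and link in a finisher
    }
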
+
+
+
+
+
+// MKDIR
+
+void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
+{
+ CInode *newi = 0;
+ CDentry *dn = 0;
- // mark dirty
- dn->mark_dirty();
- newi->mark_dirty();
+ // make dentry and inode, xlock dentry.
+ if (!prepare_mknod(req, diri, &newi, &dn))
+ return;
+ assert(newi);
+ assert(dn);
+
+ // it's a directory.
+ newi->inode.mode = req->get_iarg();
+ newi->inode.mode &= ~INODE_TYPE_MASK;
+ newi->inode.mode |= INODE_MODE_DIR;
+ newi->inode.layout = g_OSD_MDDirLayout;
+
+ // ...and that new dir is empty.
+ CDir *newdir = newi->get_or_open_dir(mds->mdcache);
+ newdir->mark_complete();
+ newdir->mark_dirty(newdir->pre_dirty());
+
+ // prepare finisher
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi);
+ EUpdate *le = new EUpdate("mkdir");
+ le->metablob.add_dir_context(diri->dir);
+ inode_t *pi = le->metablob.add_dentry(dn, true, newi);
+ pi->version = dn->get_projected_version();
+ le->metablob.add_dir(newi->dir, true);
- // journal it
- //mdlog->submit_entry(new EMknod(newi));
+ // log + wait
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
- // ok!
- return newi;
+
+ /* old export heuristic. probably need to reimplement this at some point.
+ if (
+ diri->dir->is_auth() &&
+ diri->dir->is_rep() &&
+ newdir->is_auth() &&
+ !newdir->is_hashing()) {
+ int dest = rand() % mds->mdsmap->get_num_mds();
+ if (dest != whoami) {
+ dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
+ mdcache->migrator->export_dir(newdir, dest);
+ }
+ }
+ */
+}
+
+
+
+// SYMLINK
+
+void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
+{
+ CInode *newi = 0;
+ CDentry *dn = 0;
+
+ // make dentry and inode, xlock dentry.
+ if (!prepare_mknod(req, diri, &newi, &dn))
+ return;
+ assert(newi);
+ assert(dn);
+
+ // it's a symlink
+ newi->inode.mode &= ~INODE_TYPE_MASK;
+ newi->inode.mode |= INODE_MODE_SYMLINK;
+ newi->symlink = req->get_sarg();
+
+ // prepare finisher
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi);
+ EUpdate *le = new EUpdate("symlink");
+ le->metablob.add_dir_context(diri->dir);
+ inode_t *pi = le->metablob.add_dentry(dn, true, newi);
+ pi->version = dn->get_projected_version();
+
+ // log + wait
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
}
+
+
+
// LINK
class C_MDS_LinkTraverse : public Context {
{
// figure out name
string dname = req->get_filepath().last_bit();
- dout(7) << "dname is " << dname << endl;
-
- // make sure parent is a dir?
- if (!ref->is_dir()) {
- dout(7) << "not a dir " << *ref << endl;
- reply_request(req, -EINVAL);
- return;
- }
-
- // am i not open, not auth?
- if (!ref->dir && !ref->is_auth()) {
- int dirauth = ref->authority();
- dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
- mdcache->request_forward(req, dirauth);
- return;
- }
-
- if (!try_open_dir(ref, req)) return;
- CDir *dir = ref->dir;
- dout(7) << "handle_client_link dir is " << *dir << endl;
-
-
-
- // make sure it's my dentry
- int dauth = dir->dentry_authority(dname);
- if (dauth != mds->get_nodeid()) {
- // fw
- dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl;
- mdcache->request_forward(req, dauth);
- return;
- }
- // ok, done passing buck.
+ dout(7) << "handle_client_link dname is " << dname << endl;
+ // validate dir
+ CDir *dir = validate_new_dentry_dir(req, ref, dname);
+ if (!dir) return;
- // exists?
+ // dentry exists?
CDentry *dn = dir->lookup(dname);
if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
dout(7) << "handle_client_link dn exists " << *dn << endl;
return;
}
- // keep src dir in memory
- mdcache->request_pin_dir(req, dir);
+ // xlock dentry
+ if (!dn->is_xlockedbyme(req)) {
+ if (!mds->locker->dentry_xlock_start(dn, req, ref))
+ return;
+ }
// discover link target
filepath target = req->get_sarg();
-
dout(7) << "handle_client_link discovering target " << target << endl;
-
C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref);
Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
}
};
-void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace)
+void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector<CDentry*>& trace)
{
// target dne?
if (r < 0) {
return;
}
- // keep target inode in memory
- mdcache->request_pin_inode(req, targeti);
-
- dout(7) << "dir is " << *ref << endl;
-
- // xlock the dentry
- CDir *dir = ref->dir;
+ // what was the new dentry again?
+ CDir *dir = diri->dir;
assert(dir);
-
string dname = req->get_filepath().last_bit();
- int dauth = dir->dentry_authority(dname);
- if (mds->get_nodeid() != dauth) {
- // ugh, exported out from under us
- dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl;
- mdcache->request_forward(req, dauth);
- return;
- }
-
CDentry *dn = dir->lookup(dname);
- if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
- dout(7) << "handle_client_link dn exists " << *dn << endl;
- reply_request(req, -EEXIST);
- return;
- }
+ assert(dn);
+ assert(dn->is_xlockedbyme(req));
- if (!dn) dn = dir->add_dentry(dname);
-
- if (!dn->is_xlockedbyme(req)) {
- if (!mds->locker->dentry_xlock_start(dn, req, ref)) {
- if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);
- return;
- }
- }
-
- // ok xlocked!
+ // ok!
if (targeti->is_auth()) {
// mine
- if (targeti->is_anchored()) {
+
+ // same dir?
+ if (targeti->get_parent_dir() == dn->get_dir()) {
+ dout(7) << "target is in the same dir, sweet" << endl;
+ }
+ else if (targeti->is_anchored()) {
dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl;
} else {
assert(targeti->inode.nlink == 1);
dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl;
mdcache->anchor_inode(targeti,
- new C_MDS_RetryRequest(mds, req, ref));
+ new C_MDS_RetryRequest(mds, req, diri));
return;
}
// ok, inc link!
targeti->inode.nlink++;
dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl;
- targeti->mark_dirty();
+ targeti->_mark_dirty(); // fixme
} else {
// remote: send nlink++ request, wait
// wait
targeti->add_waiter(CINODE_WAIT_LINK,
- new C_MDS_RemoteLink(this, req, ref, dn, targeti));
+ new C_MDS_RemoteLink(this, req, diri, dn, targeti));
return;
}
- handle_client_link_finish(req, ref, dn, targeti);
+ handle_client_link_finish(req, diri, dn, targeti);
}
void Server::handle_client_link_finish(MClientRequest *req, CInode *ref,
- CDentry *dn, CInode *targeti)
+ CDentry *dn, CInode *targeti)
{
// create remote link
dn->dir->link_inode(dn, targeti->ino());
dn->link_remote( targeti ); // since we have it
- dn->mark_dirty();
+ dn->_mark_dirty(); // fixme
mds->balancer->hit_dir(dn->dir, META_POP_DWR);
MClientReply *reply = new MClientReply(req,0);
mdcache->dentry_unlink(dn,
new C_MDS_CommitRequest(this, req, reply, diri,
- new EInodeUpdate(diri))); // FIXME WRONG EVENT
+ new EString("unlink fixme")));
return;
}
MClientReply *reply = new MClientReply(req, 0);
mdcache->renamer->file_rename( srcdn, destdn,
new C_MDS_CommitRequest(this, req, reply, srcdn->inode,
- new EInodeUpdate(srcdn->inode)) ); // FIXME WRONG EVENT
-}
-
-
-
-
-
-
-
-// MKDIR
-
-void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
-{
- // make dentry and inode, link.
- CInode *newi = mknod(req, diri);
- if (!newi) return;
-
- // make my new inode a dir.
- newi->inode.mode = req->get_iarg();
- newi->inode.mode &= ~INODE_TYPE_MASK;
- newi->inode.mode |= INODE_MODE_DIR;
-
- // use dir layout
- newi->inode.layout = g_OSD_MDDirLayout;
-
- // init dir to be empty
- assert(!newi->is_frozen_dir()); // bc mknod worked
- CDir *newdir = newi->get_or_open_dir(mds);
- newdir->mark_complete();
- newdir->mark_dirty();
-
- mds->balancer->hit_dir(newdir, META_POP_DWR);
-
- if (
- diri->dir->is_auth() &&
- diri->dir->is_rep() &&
- newdir->is_auth() &&
- !newdir->is_hashing()) {
- int dest = rand() % mds->mdsmap->get_num_mds();
- if (dest != mds->get_nodeid()) {
- dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
- mdcache->migrator->export_dir(newdir, dest);
- }
- }
-
- // commit to log
- commit_request(req, new MClientReply(req, 0), diri,
- new EMkdir(newdir));
- //new EInodeUpdate(newi),//);
- //new EDirUpdate(newdir)); // FIXME: weird performance regression here w/ double log; somewhat of a mystery!
- return;
+ new EString("file rename fixme")) );
}
-// SYMLINK
-
-void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
-{
- // make dentry and inode, link.
- CInode *newi = mknod(req, diri);
- if (!newi) return;
-
- // make my new inode a symlink
- newi->inode.mode &= ~INODE_TYPE_MASK;
- newi->inode.mode |= INODE_MODE_SYMLINK;
-
- // set target
- newi->symlink = req->get_sarg();
-
- mds->balancer->hit_dir(diri->dir, META_POP_DWR);
-
- // commit
- commit_request(req, new MClientReply(req, 0), diri,
- new EInodeUpdate(newi)); // FIXME should be differnet log entry
-}
-
// do update
cur->inode.size = req->get_sizearg();
- cur->mark_dirty();
+ cur->_mark_dirty(); // fixme
mds->locker->inode_file_write_finish(cur);
// commit
commit_request(req, reply, cur,
- new EInodeUpdate(cur));
+ new EString("truncate fixme"));
}
// open, openc, close
void Server::handle_client_open(MClientRequest *req,
- CInode *cur)
+ CInode *cur)
{
int flags = req->get_iarg();
int mode = req->get_iarg2();
}
+class C_MDS_openc_finish : public Context {
+ MDS *mds;
+ MClientRequest *req;
+ CDentry *dn;
+ CInode *newi;
+ version_t pv;
+public:
+ C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) :
+ mds(m), req(r), dn(d), newi(ni),
+ pv(d->get_projected_version()) {}
+ void finish(int r) {
+ assert(r == 0);
+
+ // link the inode
+ dn->get_dir()->link_inode(dn, newi);
+
+ // dirty inode, dn, dir
+ newi->mark_dirty(pv);
+
+ // unlock
+ mds->locker->dentry_xlock_finish(dn);
+
+ // hit pop
+ mds->balancer->hit_inode(newi, META_POP_IWR);
+
+ // ok, do the open.
+ mds->server->handle_client_open(req, newi);
+ }
+};
+
-void Server::handle_client_openc(MClientRequest *req, CInode *ref)
+void Server::handle_client_openc(MClientRequest *req, CInode *diri)
{
dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl;
- CInode *in = mknod(req, ref, true);
- if (!in) return;
-
- in->inode.mode = 0644; // wtf FIXME
- in->inode.mode |= INODE_MODE_FILE;
+ CInode *in = 0;
+ CDentry *dn = 0;
+
+ // make dentry and inode, xlock dentry.
+ int r = prepare_mknod(req, diri, &in, &dn);
+ if (!r)
+ return; // wait on something
+ assert(in);
+ assert(dn);
+
+ if (r == 1) {
+ // created.
+ // it's a file.
+ in->inode.mode = 0644; // FIXME req should have a umask
+ in->inode.mode |= INODE_MODE_FILE;
+
+ // prepare finisher
+ C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in);
+ EUpdate *le = new EUpdate("openc");
+ le->metablob.add_dir_context(diri->dir);
+ inode_t *pi = le->metablob.add_dentry(dn, true, in);
+ pi->version = dn->get_projected_version();
+
+ // log + wait
+ mdlog->submit_entry(le);
+ mdlog->wait_for_sync(fin);
- handle_client_open(req, in);
+ /*
+ FIXME. this needs to be rewritten when the write capability stuff starts
+ getting journaled.
+ */
+ } else {
+ // exists!
+ // FIXME: do i need to repin the path based on the existing inode? hmm.
+ handle_client_open(req, in);
+ }
}
// generic request helpers
void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0);
void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei);
+
+ void submit_update(MClientRequest *req, CInode *wrlockedi,
+ LogEvent *event,
+ Context *oncommit);
+
void commit_request(MClientRequest *req,
MClientReply *reply,
CInode *tracei,
void handle_client_truncate(MClientRequest *req, CInode *in);
void handle_client_fsync(MClientRequest *req, CInode *in);
+
+ // some helpers
CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc
+ CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname);
+ int prepare_mknod(MClientRequest *req, CInode *diri,
+ CInode **pin, CDentry **pdn,
+ bool okexist=false);
+
+
};
void print(ostream& out) {
if (what == EALLOC_EV_ALLOC)
- out << "alloc " << hex << id << dec << " tablev " << table_version;
+ out << "EAlloc alloc " << hex << id << dec << " tablev " << table_version;
else
- out << "dealloc " << hex << id << dec << " tablev " << table_version;
+ out << "EAlloc dealloc " << hex << id << dec << " tablev " << table_version;
}
- // live journal
- bool can_expire(MDS *mds) {
- if (mds->idalloc->get_committed_version() < table_version)
- return false; // still dirty
- else
- return true; // already flushed
- }
-
- void retire(MDS *mds, Context *c) {
- mds->idalloc->save(c, table_version);
- }
-
-
- // recovery
- bool has_happened(MDS *mds) {
- if (mds->idalloc->get_version() >= table_version) {
- cout << " event " << table_version << " <= table " << mds->idalloc->get_version() << endl;
- return true;
- } else
- return false;
- }
-
- void replay(MDS *mds) {
- assert(table_version-1 == mds->idalloc->get_version());
-
- if (what == EALLOC_EV_ALLOC) {
- idno_t nid = mds->idalloc->alloc_id(true);
- assert(nid == id); // this should match.
- }
- else if (what == EALLOC_EV_FREE) {
- mds->idalloc->reclaim_id(id, true);
- }
- else
- assert(0);
-
- assert(table_version == mds->idalloc->get_version());
- }
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __EDIRUPDATE_H
-#define __EDIRUPDATE_H
-
-#include <assert.h>
-#include "config.h"
-#include "include/types.h"
-
-#include "../LogEvent.h"
-#include "ETrace.h"
-#include "../CDir.h"
-#include "../MDCache.h"
-#include "../MDStore.h"
-
-
-
-class EDirUpdate : public LogEvent {
- protected:
- ETrace trace;
- inodeno_t dirino;
- version_t version;
-
- public:
- EDirUpdate(CDir *dir) : LogEvent(EVENT_DIRUPDATE),
- trace(dir->inode) {
- this->dirino = dir->ino();
- version = dir->get_version();
- }
- EDirUpdate() : LogEvent(EVENT_DIRUPDATE) {
- }
-
- void print(ostream& out) {
- out << "up dir " << dirino << " "
- << trace
- << "/ v " << version;
- }
-
- virtual void encode_payload(bufferlist& bl) {
- trace.encode(bl);
- bl.append((char*)&version, sizeof(version));
- bl.append((char*)&dirino, sizeof(dirino));
- }
- void decode_payload(bufferlist& bl, int& off) {
- trace.decode(bl, off);
- bl.copy(off, sizeof(version), (char*)&version);
- off += sizeof(version);
- bl.copy(off, sizeof(dirino), (char*)&dirino);
- off += sizeof(dirino);
- }
-
-
- virtual bool can_expire(MDS *mds) {
- // am i obsolete?
- CInode *in = mds->mdcache->get_inode(dirino);
- if (!in) return true;
- CDir *dir = in->dir;
- if (!dir) return true;
-
- dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl;
-
- if (!dir->is_auth()) return true; // not mine!
- if (dir->is_frozen()) return true; // frozen -> exporting -> obsolete? FIXME
-
- if (!dir->is_dirty()) return true;
-
- if (dir->get_committing_version() > version)
- return true;
-
- return false;
- }
-
- virtual void retire(MDS *mds, Context *c) {
- // commit directory
- CInode *in = mds->mdcache->get_inode(dirino);
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(10) << "EDirUpdate committing dir " << *dir << endl;
- mds->mdstore->commit_dir(dir, c);
- }
-
-};
-
-#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EEXPORTFINISH_H
+#define __EEXPORTFINISH_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../MDS.h"
+
+class EExportFinish : public LogEvent {
+ protected:
+ inodeno_t dirino; // exported dir
+ bool success;
+
+ public:
+ EExportFinish(CDir *dir, bool s) : LogEvent(EVENT_EXPORTFINISH),
+ dirino(dir->ino()),
+ success(s) { }
+ EExportFinish() : LogEvent(EVENT_EXPORTFINISH) { }
+
+ void print(ostream& out) {
+ out << "export_finish " << dirino;
+ if (success)
+ out << " success";
+ else
+ out << " failure";
+ }
+
+ virtual void encode_payload(bufferlist& bl) {
+ bl.append((char*)&dirino, sizeof(dirino));
+ bl.append((char*)&success, sizeof(success));
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ bl.copy(off, sizeof(dirino), (char*)&dirino);
+ off += sizeof(dirino);
+ bl.copy(off, sizeof(success), (char*)&success);
+ off += sizeof(success);
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EEXPORTSTART_H
+#define __EEXPORTSTART_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../MDS.h"
+
+#include "EMetaBlob.h"
+
+class EExportStart : public LogEvent {
+ public:
+ EMetaBlob metablob; // exported dir
+ protected:
+ inodeno_t dirino;
+ int dest; // dest mds
+ set<inodeno_t> bounds;
+
+ public:
+ EExportStart(CDir *dir, int d) : LogEvent(EVENT_EXPORTSTART),
+ dirino(dir->ino()),
+ dest(d) {
+ metablob.add_dir_context(dir);
+ }
+ EExportStart() : LogEvent(EVENT_EXPORTSTART) { }
+
+ set<inodeno_t> &get_bounds() { return bounds; }
+
+ void print(ostream& out) {
+ out << "export_start " << dirino << " -> " << dest;
+ }
+
+ virtual void encode_payload(bufferlist& bl) {
+ metablob._encode(bl);
+ bl.append((char*)&dirino, sizeof(dirino));
+ bl.append((char*)&dest, sizeof(dest));
+ ::_encode(bounds, bl);
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ metablob._decode(bl, off);
+ bl.copy(off, sizeof(dirino), (char*)&dirino);
+ off += sizeof(dirino);
+ bl.copy(off, sizeof(dest), (char*)&dest);
+ off += sizeof(dest);
+ ::_decode(bounds, bl, off);
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+
+};
+
+#endif
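// Illustrative sketch: the events in this directory serialize containers such as
// set<inodeno_t> and list<inodeno_t> with global ::_encode/::_decode helpers that
// live elsewhere in the tree and are not shown in this diff.  The stand-ins below
// only sketch the length-prefixed convention those encode_payload()/decode_payload()
// methods appear to assume; the names are hypothetical, not the real helpers.
template<class T>
inline void _encode_set_sketch(const std::set<T>& s, bufferlist& bl) {
  int n = s.size();
  bl.append((char*)&n, sizeof(n));                   // element count first...
  for (typename std::set<T>::const_iterator p = s.begin(); p != s.end(); ++p)
    bl.append((char*)&(*p), sizeof(T));              // ...then the raw elements
}
template<class T>
inline void _decode_set_sketch(std::set<T>& s, bufferlist& bl, int& off) {
  int n;
  bl.copy(off, sizeof(n), (char*)&n); off += sizeof(n);
  while (n--) {
    T v;
    bl.copy(off, sizeof(v), (char*)&v); off += sizeof(v);
    s.insert(v);
  }
}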
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EIMPORTFINISH_H
+#define __EIMPORTFINISH_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../MDS.h"
+
+class EImportFinish : public LogEvent {
+ protected:
+ inodeno_t dirino; // imported dir
+ bool success;
+
+ public:
+ EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH),
+ dirino(dir->ino()),
+ success(s) { }
+ EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { }
+
+ void print(ostream& out) {
+ out << "import_finish " << dirino;
+ if (success)
+ out << " success";
+ else
+ out << " failed";
+ }
+
+ virtual void encode_payload(bufferlist& bl) {
+ bl.append((char*)&dirino, sizeof(dirino));
+ bl.append((char*)&success, sizeof(success));
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ bl.copy(off, sizeof(dirino), (char*)&dirino);
+ off += sizeof(dirino);
+ bl.copy(off, sizeof(success), (char*)&success);
+ off += sizeof(success);
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_EIMPORTMAP_H
+#define __MDS_EIMPORTMAP_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class EImportMap : public LogEvent {
+public:
+ EMetaBlob metablob;
+ set<inodeno_t> imports;
+ set<inodeno_t> exports;
+ //set<inodeno_t> hashdirs;
+ map<inodeno_t, set<inodeno_t> > nested_exports;
+
+ EImportMap() : LogEvent(EVENT_IMPORTMAP) { }
+
+ void print(ostream& out) {
+ out << "import_map " << imports.size() << " imports, "
+ << exports.size() << " exports"
+ << " " << metablob;
+ }
+
+ void encode_payload(bufferlist& bl) {
+ metablob._encode(bl);
+ ::_encode(imports, bl);
+ ::_encode(exports, bl);
+ for (set<inodeno_t>::iterator p = imports.begin();
+ p != imports.end();
+ ++p) {
+ ::_encode(nested_exports[*p], bl);
+ if (nested_exports[*p].empty())
+ nested_exports.erase(*p);
+ }
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ metablob._decode(bl, off);
+ ::_decode(imports, bl, off);
+ ::_decode(exports, bl, off);
+ for (set<inodeno_t>::iterator p = imports.begin();
+ p != imports.end();
+ ++p) {
+ ::_decode(nested_exports[*p], bl, off);
+ if (nested_exports[*p].empty())
+ nested_exports.erase(*p);
+ }
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EIMPORTSTART_H
+#define __EIMPORTSTART_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../MDS.h"
+
+#include "EMetaBlob.h"
+
+class EImportStart : public LogEvent {
+protected:
+ inodeno_t dirino;
+ list<inodeno_t> bounds;
+
+ public:
+ EMetaBlob metablob;
+
+ EImportStart(inodeno_t di,
+ list<inodeno_t>& b) : LogEvent(EVENT_IMPORTSTART),
+ dirino(di), bounds(b) { }
+ EImportStart() : LogEvent(EVENT_IMPORTSTART) { }
+
+ void print(ostream& out) {
+ out << "EImportStart " << metablob;
+ }
+
+ virtual void encode_payload(bufferlist& bl) {
+ bl.append((char*)&dirino, sizeof(dirino));
+ metablob._encode(bl);
+ ::_encode(bounds, bl);
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ bl.copy(off, sizeof(dirino), (char*)&dirino);
+ off += sizeof(dirino);
+ metablob._decode(bl, off);
+ ::_decode(bounds, bl, off);
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+
+};
+
+#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __EINODEUPDATE_H
-#define __EINODEUPDATE_H
-
-#include <assert.h>
-#include "config.h"
-#include "include/types.h"
-
-#include "../LogEvent.h"
-#include "ETrace.h"
-
-
-class EInodeUpdate : public LogEvent {
- protected:
- ETrace trace;
-
- public:
- EInodeUpdate(CInode *in) : LogEvent(EVENT_INODEUPDATE),
- trace(in) {
- }
- EInodeUpdate() : LogEvent(EVENT_INODEUPDATE) { }
-
- void print(ostream& out) {
- out << "up inode " << trace.back().inode.ino
- << " " << trace
- << " v " << trace.back().inode.version;
- }
-
- virtual void encode_payload(bufferlist& bl) {
- trace.encode(bl);
- }
- void decode_payload(bufferlist& bl, int& off) {
- trace.decode(bl, off);
- }
-
- bool can_expire(MDS *mds);
- void retire(MDS *mds, Context *c);
- bool has_happened(MDS *mds);
- void replay(MDS *mds);
-
-};
-
-#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_EMETABLOB_H
+#define __MDS_EMETABLOB_H
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+
+
+class MDS;
+
+/*
+ * a bunch of metadata in the journal
+ */
+
+/* notes:
+ *
+ * - make sure you adjust the inode.version for any modified inode you
+ * journal. CDir and CDentry maintain a projected_version, but CInode
+ * doesn't, since the journaled inode usually has to be modified
+ * manually anyway (to delay the change in the MDS's cache until after
+ * it is journaled).
+ *
+ */
+
+
+class EMetaBlob {
+
+ /* fullbit - a regular dentry + inode
+ */
+ struct fullbit {
+ string dn; // dentry
+ version_t dnv;
+ inode_t inode; // if it's not
+ string symlink;
+ bool dirty;
+
+ fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { }
+ fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { }
+ fullbit(bufferlist& bl, int& off) { _decode(bl, off); }
+ void _encode(bufferlist& bl) {
+ ::_encode(dn, bl);
+ bl.append((char*)&dnv, sizeof(dnv));
+ bl.append((char*)&inode, sizeof(inode));
+ if (inode.is_symlink())
+ ::_encode(symlink, bl);
+ bl.append((char*)&dirty, sizeof(dirty));
+ }
+ void _decode(bufferlist& bl, int& off) {
+ ::_decode(dn, bl, off);
+ bl.copy(off, sizeof(dnv), (char*)&dnv);
+ off += sizeof(dnv);
+ bl.copy(off, sizeof(inode), (char*)&inode);
+ off += sizeof(inode);
+ if (inode.is_symlink())
+ ::_decode(symlink, bl, off);
+ bl.copy(off, sizeof(dirty), (char*)&dirty);
+ off += sizeof(dirty);
+ }
+ };
+
+ /* remotebit - a dentry + remote inode link (i.e. just an ino)
+ */
+ struct remotebit {
+ string dn;
+ version_t dnv;
+ inodeno_t ino;
+ bool dirty;
+
+ remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { }
+ remotebit(bufferlist& bl, int& off) { _decode(bl, off); }
+ void _encode(bufferlist& bl) {
+ ::_encode(dn, bl);
+ bl.append((char*)&dnv, sizeof(dnv));
+ bl.append((char*)&ino, sizeof(ino));
+ bl.append((char*)&dirty, sizeof(dirty));
+ }
+ void _decode(bufferlist& bl, int& off) {
+ ::_decode(dn, bl, off);
+ bl.copy(off, sizeof(dnv), (char*)&dnv);
+ off += sizeof(dnv);
+ bl.copy(off, sizeof(ino), (char*)&ino);
+ off += sizeof(ino);
+ bl.copy(off, sizeof(dirty), (char*)&dirty);
+ off += sizeof(dirty);
+ }
+ };
+
+ /*
+ * nullbit - a null dentry
+ */
+ struct nullbit {
+ string dn;
+ version_t dnv;
+ bool dirty;
+ nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { }
+ nullbit(bufferlist& bl, int& off) { _decode(bl, off); }
+ void _encode(bufferlist& bl) {
+ ::_encode(dn, bl);
+ bl.append((char*)&dnv, sizeof(dnv));
+ bl.append((char*)&dirty, sizeof(dirty));
+ }
+ void _decode(bufferlist& bl, int& off) {
+ ::_decode(dn, bl, off);
+ bl.copy(off, sizeof(dnv), (char*)&dnv);
+ off += sizeof(dnv);
+ bl.copy(off, sizeof(dirty), (char*)&dirty);
+ off += sizeof(dirty);
+ }
+ };
+
+
+ /* dirlump - contains metadata for any dir we have contents for.
+ */
+ struct dirlump {
+ static const int STATE_IMPORT = (1<<0);
+ static const int STATE_COMPLETE = (1<<1);
+ static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
+
+ dirslice_t dirslice;
+ version_t dirv;
+ int state;
+ int nfull, nremote, nnull;
+ bufferlist bfull, bremote, bnull;
+
+ private:
+ bool dn_decoded;
+ list<fullbit> dfull;
+ list<remotebit> dremote;
+ list<nullbit> dnull;
+
+ public:
+ dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
+
+ bool is_import() { return state & STATE_IMPORT; }
+ void mark_import() { state |= STATE_IMPORT; }
+ bool is_complete() { return state & STATE_COMPLETE; }
+ void mark_complete() { state |= STATE_COMPLETE; }
+ bool is_dirty() { return state & STATE_DIRTY; }
+ void mark_dirty() { state |= STATE_DIRTY; }
+
+ list<fullbit> &get_dfull() { return dfull; }
+ list<remotebit> &get_dremote() { return dremote; }
+ list<nullbit> &get_dnull() { return dnull; }
+
+ void _encode_bits() {
+ for (list<fullbit>::iterator p = dfull.begin(); p != dfull.end(); ++p)
+ p->_encode(bfull);
+ for (list<remotebit>::iterator p = dremote.begin(); p != dremote.end(); ++p)
+ p->_encode(bremote);
+ for (list<nullbit>::iterator p = dnull.begin(); p != dnull.end(); ++p)
+ p->_encode(bnull);
+ }
+ void _decode_bits() {
+ if (dn_decoded) return;
+ int off = 0;
+ for (int i=0; i<nfull; i++)
+ dfull.push_back(fullbit(bfull, off));
+ off = 0;
+ for (int i=0; i<nremote; i++)
+ dremote.push_back(remotebit(bremote, off));
+ off = 0;
+ for (int i=0; i<nnull; i++)
+ dnull.push_back(nullbit(bnull, off));
+ dn_decoded = true;
+ }
+
+ void _encode(bufferlist& bl) {
+ bl.append((char*)&dirslice, sizeof(dirslice));
+ bl.append((char*)&dirv, sizeof(dirv));
+ bl.append((char*)&state, sizeof(state));
+ bl.append((char*)&nfull, sizeof(nfull));
+ bl.append((char*)&nremote, sizeof(nremote));
+ bl.append((char*)&nnull, sizeof(nnull));
+ _encode_bits();
+ ::_encode(bfull, bl);
+ ::_encode(bremote, bl);
+ ::_encode(bnull, bl);
+ }
+ void _decode(bufferlist& bl, int& off) {
+ bl.copy(off, sizeof(dirslice), (char*)&dirslice); off += sizeof(dirslice);
+ bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv);
+ bl.copy(off, sizeof(state), (char*)&state); off += sizeof(state);
+ bl.copy(off, sizeof(nfull), (char*)&nfull); off += sizeof(nfull);
+ bl.copy(off, sizeof(nremote), (char*)&nremote); off += sizeof(nremote);
+ bl.copy(off, sizeof(nnull), (char*)&nnull); off += sizeof(nnull);
+ ::_decode(bfull, bl, off);
+ ::_decode(bremote, bl, off);
+ ::_decode(bnull, bl, off);
+ // don't decode bits unless we need them.
+ dn_decoded = false;
+ }
+ };
+
+ // my lumps. preserve the order we added them in a list.
+ list<inodeno_t> lump_order;
+ map<inodeno_t, dirlump> lump_map;
+
+ public:
+
+ // returns a pointer to the to-be-journaled inode, iff it's a normal (non-remote) dentry
+ inode_t *add_dentry(CDentry *dn, bool dirty, CInode *in=0) {
+ CDir *dir = dn->get_dir();
+ if (!in) in = dn->get_inode();
+
+ // add the dir
+ dirlump& lump = add_dir(dir, false);
+
+ // add the dirbit
+ if (dn->is_remote()) {
+ lump.nremote++;
+ if (dirty)
+ lump.get_dremote().push_front(remotebit(dn->get_name(),
+ dn->get_projected_version(),
+ dn->get_remote_ino(),
+ dirty));
+ else
+ lump.get_dremote().push_back(remotebit(dn->get_name(),
+ dn->get_projected_version(),
+ dn->get_remote_ino(),
+ dirty));
+ }
+ else if (!in) {
+ lump.nnull++;
+ if (dirty)
+ lump.get_dnull().push_front(nullbit(dn->get_name(),
+ dn->get_projected_version(),
+ dirty));
+ else
+ lump.get_dnull().push_back(nullbit(dn->get_name(),
+ dn->get_projected_version(),
+ dirty));
+ }
+ else {
+ lump.nfull++;
+ if (dirty) {
+ lump.get_dfull().push_front(fullbit(dn->get_name(),
+ dn->get_projected_version(),
+ in->inode, in->symlink,
+ dirty));
+ return &lump.get_dfull().front().inode;
+ } else {
+ lump.get_dfull().push_back(fullbit(dn->get_name(),
+ dn->get_projected_version(),
+ in->inode, in->symlink,
+ dirty));
+ return &lump.get_dfull().back().inode;
+ }
+ }
+ return 0;
+ }
+
+ dirlump& add_dir(CDir *dir, bool dirty) {
+ if (lump_map.count(dir->ino()) == 0) {
+ lump_order.push_back(dir->ino());
+ lump_map[dir->ino()].dirv = dir->get_projected_version();
+ }
+ dirlump& l = lump_map[dir->ino()];
+ if (dir->is_complete()) l.mark_complete();
+ if (dir->is_import()) l.mark_import();
+ if (dirty) l.mark_dirty();
+ return l;
+ }
+
+ void add_dir_context(CDir *dir, bool toroot=false) {
+ // already have this dir? (we must always add in order)
+ if (lump_map.count(dir->ino()))
+ return;
+
+ CInode *diri = dir->get_inode();
+ if (!toroot &&
+ (dir->is_import() || dir->is_hashed()))
+ return; // stop at import point
+ if (!dir->get_inode()->get_parent_dn())
+ return;
+
+ CDentry *parent = diri->get_parent_dn();
+ add_dir_context(parent->get_dir(), toroot);
+ add_dentry(parent, false);
+ }
+
+
+ // encoding
+
+ void _encode(bufferlist& bl) {
+ int n = lump_map.size();
+ bl.append((char*)&n, sizeof(n));
+ for (list<inodeno_t>::iterator i = lump_order.begin();
+ i != lump_order.end();
+ ++i) {
+ bl.append((char*)&(*i), sizeof(*i));
+ lump_map[*i]._encode(bl);
+ }
+ }
+ void _decode(bufferlist& bl, int& off) {
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ inodeno_t dirino;
+ bl.copy(off, sizeof(dirino), (char*)&dirino);
+ off += sizeof(dirino);
+ lump_order.push_back(dirino);
+ lump_map[dirino]._decode(bl, off);
+ }
+ }
+
+ void print(ostream& out) const {
+ out << "[metablob " << lump_order.front()
+ << ", " << lump_map.size() << " dirs]";
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+};
+
+inline ostream& operator<<(ostream& out, const EMetaBlob& t) {
+ t.print(out);
+ return out;
+}
+
+#endif
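// Illustrative sketch (assumes EUpdate.h, which appears later in this patch, is
// also included): a typical caller fills an EUpdate's metablob as described in
// the "notes" comment near the top of this header.  The helper name is
// hypothetical; only the EMetaBlob/EUpdate interfaces declared in this patch are used.
inline EUpdate *journal_dentry_update_sketch(CDentry *dn, CInode *in) {
  EUpdate *le = new EUpdate("update");                  // op name, shown by EUpdate::print()
  le->metablob.add_dir_context(dn->get_dir());          // parent trail up to the import point
  inode_t *pi = le->metablob.add_dentry(dn, true, in);  // journal a dirty primary dentry + inode
  if (pi)
    pi->version = in->inode.version + 1;                // per the notes above: bump the journaled version
  return le;                                            // caller hands this to the MDLog for submission
}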
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __EMKDIR_H
-#define __EMKDIR_H
-
-#include <assert.h>
-#include "config.h"
-#include "include/types.h"
-
-#include "ETrace.h"
-#include "../MDS.h"
-#include "../MDStore.h"
-
-
-class EMkdir : public LogEvent {
- protected:
- ETrace trace;
- //version_t pdirv;
-
- public:
- EMkdir(CDir *dir) : LogEvent(EVENT_MKDIR),
- trace(dir->inode) {
- //pdirv = dir->inode->get_parent_dir()->get_version();
- }
- EMkdir() : LogEvent(EVENT_MKDIR) { }
-
- void print(ostream& out) {
- out << "mkdir ";
- trace.print(out);
- }
-
- virtual void encode_payload(bufferlist& bl) {
- trace.encode(bl);
- //bl.append((char*)&pdirv, sizeof(pdirv));
- }
- void decode_payload(bufferlist& bl, int& off) {
- trace.decode(bl, off);
- //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
- //off += sizeof(pdirv);
- }
-
- bool can_expire(MDS *mds);
- void retire(MDS *mds, Context *c);
-
- // recovery
- bool has_happened(MDS *mds);
- void replay(MDS *mds);
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __EMKNOD_H
-#define __EMKNOD_H
-
-#include <assert.h>
-#include "config.h"
-#include "include/types.h"
-
-#include "../LogEvent.h"
-#include "ETrace.h"
-#include "../MDS.h"
-#include "../MDStore.h"
-
-
-class EMknod : public LogEvent {
- protected:
- ETrace trace;
- //version_t pdirv;
-
- public:
- EMknod(CInode *in) : LogEvent(EVENT_MKNOD),
- trace(in) {
- //pdirv = in->get_parent_dir()->get_version();
- }
- EMknod() : LogEvent(EVENT_MKNOD) { }
-
- void print(ostream& out) {
- out << "mknod " << trace;
- }
-
- virtual void encode_payload(bufferlist& bl) {
- trace.encode(bl);
- //bl.append((char*)&pdirv, sizeof(pdirv));
- }
- void decode_payload(bufferlist& bl, int& off) {
- trace.decode(bl, off);
- //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
- //off += sizeof(pdirv);
- }
-
- bool can_expire(MDS *mds);
- void retire(MDS *mds, Context *c);
- bool has_happened(MDS *mds);
- void replay(MDS *mds);
-
-};
-
-#endif
bl.copy(off, sizeof(ino), (char*)&ino);
}
- bool can_expire(MDS *mds);
- void retire(MDS *mds, Context *c);
- bool has_happened(MDS *mds);
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
void replay(MDS *mds);
};
event = bl.c_str() + off;
off += event.length() + 1;
}
-
void encode_payload(bufferlist& bl) {
bl.append(event.c_str(), event.length()+1);
}
out << '"' << event << '"';
}
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+
};
#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __MDS_ETRACE_H
-#define __MDS_ETRACE_H
-
-#include <stdlib.h>
-#include <string>
-using namespace std;
-
-#include "../CInode.h"
-#include "../CDir.h"
-#include "../CDentry.h"
-
-
-// path trace for use in journal events
-
-class ETrace {
-
- // <dir, dn, inode> segment.
- struct bit {
- inodeno_t dirino;
- version_t dirv;
- string dn;
- inode_t inode;
-
- bit(bufferlist& bl, int& off) { _decode(bl,off); }
- bit(inodeno_t di, version_t dv, const string& d, inode_t i) :
- dirino(di), dirv(dv), dn(d), inode(i) {}
-
- void _encode(bufferlist& bl) {
- bl.append((char*)&dirino, sizeof(dirino));
- bl.append((char*)&dirv, sizeof(dirv));
- ::_encode(dn, bl);
- bl.append((char*)&inode, sizeof(inode));
- }
- void _decode(bufferlist& bl, int& off) {
- bl.copy(off, sizeof(dirino), (char*)&dirino); off += sizeof(dirino);
- bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv);
- ::_decode(dn, bl, off);
- bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode);
- }
- };
-
- public:
- list<bit> trace;
-
- ETrace(CInode *in = 0) {
- if (in) {
- CDir *dir;
- CDentry *dn;
- do {
- dn = in->get_parent_dn();
- if (!dn) break;
- dir = dn->get_dir();
- if (!dir) break;
-
- trace.push_front(bit(dir->ino(),
- dir->get_version(),
- dn->get_name(),
- in->inode));
-
- in = dir->get_inode();
- } while (!dir->is_import());
- }
- }
-
- bit& back() {
- return trace.back();
- }
-
- void decode(bufferlist& bl, int& off) {
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++)
- trace.push_back( bit(bl, off) );
- }
-
- void encode(bufferlist& bl) {
- int n = trace.size();
- bl.append((char*)&n, sizeof(n));
- for (list<bit>::iterator i = trace.begin();
- i != trace.end();
- i++)
- i->_encode(bl);
- }
-
- void print(ostream& out) const {
- for (list<bit>::const_iterator p = trace.begin();
- p != trace.end();
- p++) {
- if (p == trace.begin())
- out << "[" << p->dirino << "]/" << p->dn;
- else
- out << "/" << p->dn;
- }
- }
-
- CInode *restore_trace(MDS *mds);
-
-};
-
-inline ostream& operator<<(ostream& out, const ETrace& t) {
- t.print(out);
- return out;
-}
-
-#endif
#include "include/types.h"
#include "../LogEvent.h"
-#include "ETrace.h"
+#include "EMetaBlob.h"
#include "../CInode.h"
#include "../CDentry.h"
#include "../CDir.h"
+/// help rewrite me
+
class EUnlink : public LogEvent {
protected:
- ETrace diritrace;
version_t dirv;
string dname;
- ETrace inodetrace;
public:
+ EMetaBlob metablob;
+
+ /*
EUnlink(CDir *dir, CDentry* dn, CInode *in) :
LogEvent(EVENT_UNLINK),
diritrace(dir->inode),
dirv(dir->get_version()),
dname(dn->get_name()),
inodetrace(in) {}
+ */
EUnlink() : LogEvent(EVENT_UNLINK) { }
virtual void encode_payload(bufferlist& bl) {
+ /*
diritrace.encode(bl);
bl.append((char*)&dirv, sizeof(dirv));
::_encode(dname, bl);
inodetrace.encode(bl);
+ */
}
void decode_payload(bufferlist& bl, int& off) {
+ /*
diritrace.decode(bl,off);
bl.copy(off, sizeof(dirv), (char*)&dirv);
off += sizeof(dirv);
::_decode(dname, bl, off);
inodetrace.decode(bl, off);
+ */
}
- bool can_expire(MDS *mds);
- void retire(MDS *mds, Context *c);
- bool has_happened(MDS *mds);
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
void replay(MDS *mds);
};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_EUPDATE_H
+#define __MDS_EUPDATE_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class EUpdate : public LogEvent {
+public:
+ EMetaBlob metablob;
+ string type;
+
+ EUpdate() : LogEvent(EVENT_UPDATE) { }
+ EUpdate(const char *s) : LogEvent(EVENT_UPDATE),
+ type(s) { }
+
+ void print(ostream& out) {
+ if (type.length())
+ out << type << " ";
+ out << metablob;
+ }
+
+ void encode_payload(bufferlist& bl) {
+ ::_encode(type, bl);
+ metablob._encode(bl);
+ }
+ void decode_payload(bufferlist& bl, int& off) {
+ ::_decode(type, bl, off);
+ metablob._decode(bl, off);
+ }
+
+ bool has_expired(MDS *mds);
+ void expire(MDS *mds, Context *c);
+ void replay(MDS *mds);
+};
+
+#endif
*
*/
-#include "events/ETrace.h"
-#include "events/EMknod.h"
-#include "events/EMkdir.h"
-#include "events/EInodeUpdate.h"
+#include "events/EString.h"
+
+#include "events/EMetaBlob.h"
+#include "events/EAlloc.h"
+#include "events/EUpdate.h"
+#include "events/EImportMap.h"
+
#include "events/EPurgeFinish.h"
#include "events/EUnlink.h"
+#include "events/EExportStart.h"
+#include "events/EExportFinish.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
#include "MDS.h"
+#include "MDLog.h"
#include "MDCache.h"
+#include "MDStore.h"
+#include "Migrator.h"
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
-#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
// -----------------------
-// ETrace
+// EString
-CInode *ETrace::restore_trace(MDS *mds)
+bool EString::has_expired(MDS *mds) {
+ dout(10) << "EString.has_expired " << event << endl;
+ return true;
+}
+void EString::expire(MDS *mds, Context *c)
{
- CInode *in = 0;
- for (list<bit>::iterator p = trace.begin();
- p != trace.end();
- ++p) {
- // the dir
- CInode *diri = mds->mdcache->get_inode(p->dirino);
- if (!diri) {
- dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl;
- diri = new CInode(mds->mdcache);
- diri->inode.ino = p->dirino;
- diri->inode.mode = INODE_MODE_DIR;
- mds->mdcache->add_inode(diri);
-
- CDir *dir = diri->get_or_open_dir(mds);
-
- // root? import?
- if (p == trace.begin()) {
- mds->mdcache->add_import(dir);
- if (dir->ino() == 1)
- mds->mdcache->set_root(diri);
- }
- } else {
- dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl;
- diri->get_or_open_dir(mds);
- }
- assert(diri->dir);
- dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl;
-
- // the inode
- in = mds->mdcache->get_inode(p->inode.ino);
- if (!in) {
- dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl;
- in = new CInode(mds->mdcache);
- in->inode = p->inode;
- mds->mdcache->add_inode(in);
-
- // the dentry
- CDentry *dn = diri->dir->add_dentry( p->dn, in );
- dn->mark_dirty();
- assert(dn);
- } else {
- dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl;
- in->inode = p->inode;
- }
- dout(20) << "ETrace.restore_trace in is " << *in << endl;
- }
- return in;
+ dout(10) << "EString.expire " << event << endl;
+}
+void EString::replay(MDS *mds)
+{
+ dout(10) << "EString.replay " << event << endl;
}
+
// -----------------------
-// EMkdir
-// - trace goes to new dir's inode.
+// EMetaBlob
-bool EMkdir::can_expire(MDS *mds)
+/*
+ * we need to ensure that a journaled item has either
+ *
+ * - been safely committed to its dirslice.
+ *
+ * - been safely exported. note that !is_auth() && !is_proxy()
+ * implies safely exported. if !is_auth() && is_proxy(), we need to
+ * add a waiter for the export to complete.
+ *
+ */
+bool EMetaBlob::has_expired(MDS *mds)
{
- // am i obsolete?
- CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
- if (!in) return true;
- CDir *dir = in->dir;
- if (!dir) return true;
- CDir *pdir = in->get_parent_dir();
- assert(pdir);
-
- dout(10) << "EMkdir.can_expire in is " << *in << endl;
- dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl;
- dout(10) << "EMkdir.can_expire dir is " << *dir << endl;
- bool commitparent = in->get_last_committed_version() < trace.back().inode.version;
- bool commitnew = dir->get_last_committed_version() == 0;
+ // examine dirv's for my lumps
+ for (map<inodeno_t,dirlump>::iterator lp = lump_map.begin();
+ lp != lump_map.end();
+ ++lp) {
+ CInode *diri = mds->mdcache->get_inode(lp->first);
+ if (!diri)
+ continue; // we expired it
+ CDir *dir = diri->dir;
+ if (!dir)
+ continue; // we expired it
+
+ // FIXME: check the slice only
+
+ if (dir->is_proxy()) {
+ dout(10) << "EMetaBlob.has_expired am proxy, needed dirv " << lp->second.dirv
+ << " for " << *dir << endl;
+ return false; // we need to wait until the export flushes!
+ }
+ if (!dir->is_auth()) {
+ dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv
+ << " for " << *dir << endl;
+ continue; // not our problem
+ }
- if (commitparent || commitnew) return false;
- return true;
+ if (dir->get_last_committed_version() < lp->second.dirv) {
+ dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv
+ << " for " << *dir << endl;
+ return false; // not committed.
+ } else {
+ dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv
+ << " for " << *dir << endl;
+ }
+ }
+
+ return true; // all dirlumps expired.
}
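// Summary of the per-dirlump decision implemented in has_expired() above
// (ignoring the FIXME about checking only the slice):
//   dir not in cache                  -> treat as expired
//   dir is_proxy()                    -> not expired (wait for the export to flush)
//   dir !is_auth()                    -> treat as expired (no longer our problem)
//   last committed version < dirv     -> not expired (still needs a commit)
//   otherwise                         -> expired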
-void EMkdir::retire(MDS *mds, Context *c)
+void EMetaBlob::expire(MDS *mds, Context *c)
{
- // commit parent dir AND my dir
- CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- CDir *pdir = in->get_parent_dir();
- assert(pdir);
-
- dout(10) << "EMkdir.retire in is " << *in << endl;
- dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl;
- dout(10) << "EMkdir.retire dir is " << *dir << endl;
- bool commitparent = in->get_last_committed_version() < trace.back().inode.version;
- bool commitnew = dir->get_last_committed_version() == 0;
-
- if (commitparent && commitnew) {
- // both
- dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl;
- C_Gather *gather = new C_Gather(c);
- mds->mdstore->commit_dir(pdir, gather->new_sub());
- mds->mdstore->commit_dir(dir, gather->new_sub());
- } else if (commitparent) {
- // just parent
- dout(10) << "EMkdir.retire committing parent dir " << *dir << endl;
- mds->mdstore->commit_dir(pdir, c);
+ list<CDir*> commit;
+ list<CDir*> waitfor_export;
+ int ncommit = 0;
+
+ // examine dirv's for my lumps
+ // make list of dir slices i need to commit
+ for (map<inodeno_t,dirlump>::iterator lp = lump_map.begin();
+ lp != lump_map.end();
+ ++lp) {
+ CInode *diri = mds->mdcache->get_inode(lp->first);
+ if (!diri)
+ continue; // we expired it
+ CDir *dir = diri->dir;
+ if (!dir)
+ continue; // we expired it
+
+ // FIXME: check the slice only
+
+ if (dir->is_proxy()) {
+ // wait until export is acked (logged on remote) and committed (logged locally)
+ CDir *ex = mds->mdcache->get_export_container(dir);
+ dout(10) << "EMetaBlob.expire proxy for " << *dir
+ << ", waiting for export finish on " << *ex << endl;
+ waitfor_export.push_back(ex);
+ continue;
+ }
+ if (!dir->is_auth()) {
+ dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv
+ << " for " << *dir << endl;
+ continue; // not our problem
+ }
+ if (dir->get_last_committed_version() < lp->second.dirv) {
+ dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv
+ << ", committing " << *dir << endl;
+ commit.push_back(dir);
+ ncommit++;
+ } else {
+ dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv
+ << " on " << *dir << endl;
+ }
+ }
+
+  // commit and/or wait for exports to finish
+  assert(!commit.empty() || !waitfor_export.empty());
+
+  if (ncommit == 1 && waitfor_export.empty()) {
+ mds->mdstore->commit_dir(commit.front(), c);
} else {
- // just new dir
- dout(10) << "EMkdir.retire committing new dir " << *dir << endl;
- mds->mdstore->commit_dir(dir, c);
+ C_Gather *gather = new C_Gather(c);
+ for (list<CDir*>::iterator p = commit.begin();
+ p != commit.end();
+ ++p)
+ mds->mdstore->commit_dir(*p, gather->new_sub());
+ for (list<CDir*>::iterator p = waitfor_export.begin();
+ p != waitfor_export.end();
+ ++p)
+ mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub());
}
}
-bool EMkdir::has_happened(MDS *mds)
-{
- return false;
-}
-
-void EMkdir::replay(MDS *mds)
+void EMetaBlob::replay(MDS *mds)
{
- dout(10) << "EMkdir.replay " << *this << endl;
- CInode *in = trace.restore_trace(mds);
+ dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl;
- // mark dir inode dirty
- in->mark_dirty();
+ // walk through my dirs (in order!)
+ for (list<inodeno_t>::iterator lp = lump_order.begin();
+ lp != lump_order.end();
+ ++lp) {
+ dout(10) << "EMetaBlob.replay dir " << *lp << endl;
+ dirlump &lump = lump_map[*lp];
- // mark parent dir dirty, and set version.
- // this may end up being below water when dir is fetched from disk.
- CDir *pdir = in->get_parent_dir();
- if (!pdir->is_dirty()) pdir->mark_dirty();
- pdir->set_version(trace.back().dirv);
-
- // mark new dir dirty + complete
- CDir *dir = in->get_or_open_dir(mds);
- dir->mark_dirty();
- dir->mark_complete();
-}
+ // the dir
+ CInode *diri = mds->mdcache->get_inode(*lp);
+ CDir *dir;
+ if (!diri) {
+ assert(*lp == 1);
+ diri = mds->mdcache->create_root_inode();
+ dout(10) << "EMetaBlob.replay created root " << *diri << endl;
+ }
+ if (diri->dir) {
+ dir = diri->dir;
+ dout(20) << "EMetaBlob.replay had dir " << *dir << endl;
+ } else {
+ dir = diri->get_or_open_dir(mds->mdcache);
+ if (*lp == 1)
+ dir->set_dir_auth(CDIR_AUTH_UNKNOWN);
+ dout(10) << "EMetaBlob.replay added dir " << *dir << endl;
+ }
+ dir->set_version( lump.dirv );
+ if (lump.is_dirty())
+ dir->_mark_dirty();
+ if (lump.is_complete())
+ dir->mark_complete();
+
+ // decode bits
+ lump._decode_bits();
+
+ // full dentry+inode pairs
+ for (list<fullbit>::iterator p = lump.get_dfull().begin();
+ p != lump.get_dfull().end();
+ p++) {
+ CInode *in = mds->mdcache->get_inode(p->inode.ino);
+ if (!in) {
+ // inode
+ in = new CInode(mds->mdcache);
+ in->inode = p->inode;
+ if (in->inode.is_symlink()) in->symlink = p->symlink;
+ mds->mdcache->add_inode(in);
+ // dentry
+ CDentry *dn = dir->add_dentry( p->dn, in );
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay added " << *dn << " " << *in << endl;
+ } else {
+ // inode
+ in->inode = p->inode;
+ if (in->inode.is_symlink()) in->symlink = p->symlink;
+ // dentry
+ CDentry *dn = in->get_parent_dn();
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay had " << *in->get_parent_dn() << " " << *in << endl;
+ }
+ }
+ // remote dentries
+ for (list<remotebit>::iterator p = lump.get_dremote().begin();
+ p != lump.get_dremote().end();
+ p++) {
+ CDentry *dn = dir->lookup(p->dn);
+ if (!dn) {
+ dn = dir->add_dentry(p->dn, p->ino);
+ dn->set_remote_ino(p->ino);
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay added " << *dn << endl;
+ } else {
+ dn->set_remote_ino(p->ino);
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay had " << *dn << endl;
+ }
+ }
+ // null dentries
+ for (list<nullbit>::iterator p = lump.get_dnull().begin();
+ p != lump.get_dnull().end();
+ p++) {
+ CDentry *dn = dir->lookup(p->dn);
+ if (!dn) {
+ dn = dir->add_dentry(p->dn);
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay added " << *dn << endl;
+ } else {
+ dn->set_version(p->dnv);
+ dn->_mark_dirty();
+ dout(10) << "EMetaBlob.replay had " << *dn << endl;
+ }
+ }
+ }
+}
-// -----------------------
-// EMknod
-
-bool EMknod::can_expire(MDS *mds)
-{
- // am i obsolete?
- CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
- if (!in) return true;
- if (!in->is_auth()) return true; // not my inode anymore!
- if (in->get_version() != trace.back().inode.version)
- return true; // i'm obsolete! (another log entry follows)
- if (in->get_last_committed_version() >= trace.back().inode.version)
- return true;
+// -----------------------
+// EAlloc
- return false;
+bool EAlloc::has_expired(MDS *mds)
+{
+ version_t cv = mds->idalloc->get_committed_version();
+ if (cv < table_version) {
+ dout(10) << "EAlloc.has_expired v " << table_version << " > " << cv
+ << ", still dirty" << endl;
+ return false; // still dirty
+ } else {
+ dout(10) << "EAlloc.has_expired v " << table_version << " <= " << cv
+ << ", already flushed" << endl;
+ return true; // already flushed
+ }
}
-void EMknod::retire(MDS *mds, Context *c)
+void EAlloc::expire(MDS *mds, Context *c)
{
- // commit parent directory
- CInode *diri = mds->mdcache->get_inode( trace.back().dirino );
- assert(diri);
- CDir *dir = diri->dir;
- assert(dir);
-
- dout(10) << "EMknod.retire committing parent dir " << *dir << endl;
- mds->mdstore->commit_dir(dir, c);
+ dout(10) << "EAlloc.expire saving idalloc table" << endl;
+ mds->idalloc->save(c, table_version);
}
-bool EMknod::has_happened(MDS *mds)
+void EAlloc::replay(MDS *mds)
{
- return false;
+ if (mds->idalloc->get_version() >= table_version) {
+ dout(10) << "EAlloc.replay event " << table_version
+ << " <= table " << mds->idalloc->get_version() << endl;
+ } else {
+ dout(10) << " EAlloc.replay event " << table_version
+ << " - 1 == table " << mds->idalloc->get_version() << endl;
+ assert(table_version-1 == mds->idalloc->get_version());
+
+ if (what == EALLOC_EV_ALLOC) {
+ idno_t nid = mds->idalloc->alloc_id(true);
+ assert(nid == id); // this should match.
+ }
+ else if (what == EALLOC_EV_FREE) {
+ mds->idalloc->reclaim_id(id, true);
+ }
+ else
+ assert(0);
+
+ assert(table_version == mds->idalloc->get_version());
+ }
}
-
-void EMknod::replay(MDS *mds)
-{
- dout(10) << "EMknod.replay " << *this << endl;
- CInode *in = trace.restore_trace(mds);
- in->mark_dirty();
- // mark parent dir dirty, and set version.
- // this may end up being below water when dir is fetched from disk.
- CDir *pdir = in->get_parent_dir();
- if (!pdir->is_dirty()) pdir->mark_dirty();
- pdir->set_version(trace.back().dirv);
-}
+// -----------------------
+// EUpdate
+bool EUpdate::has_expired(MDS *mds)
+{
+ return metablob.has_expired(mds);
+}
-// -----------------------
-// EInodeUpdate
+void EUpdate::expire(MDS *mds, Context *c)
+{
+ metablob.expire(mds, c);
+}
-bool EInodeUpdate::can_expire(MDS *mds)
+void EUpdate::replay(MDS *mds)
{
- CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
- if (!in) return true;
+ metablob.replay(mds);
+}
- if (!in->is_auth()) return true; // not my inode anymore!
- if (in->get_version() != trace.back().inode.version)
- return true; // i'm obsolete! (another log entry follows)
- /*
- // frozen -> exporting -> obsolete (FOR NOW?)
- if (in->is_frozen())
- return true;
- */
+// -----------------------
+// EImportMap
- if (in->get_last_committed_version() >= trace.back().inode.version)
+bool EImportMap::has_expired(MDS *mds)
+{
+ if (mds->mdlog->last_import_map > get_end_off()) {
+ dout(10) << "EImportMap.has_expired -- there's a newer map" << endl;
return true;
-
- return false;
+ }
+ else if (mds->mdlog->is_capped()) {
+ dout(10) << "EImportMap.has_expired -- log is capped, allowing map to expire" << endl;
+ return true;
+ } else {
+ dout(10) << "EImportMap.has_expired -- not until there's a newer map written" << endl;
+ return false;
+ }
}
-void EInodeUpdate::retire(MDS *mds, Context *c)
-{
- // commit parent directory
- CInode *diri = mds->mdcache->get_inode( trace.back().dirino );
- assert(diri);
- CDir *dir = diri->dir;
- assert(dir);
+/*
+class C_MDS_ImportMapFlush : public Context {
+ MDS *mds;
+ off_t end_off;
+public:
+ C_MDS_ImportMapFlush(MDS *m, off_t eo) : mds(m), end_off(eo) { }
+ void finish(int r) {
+ // am i the last thing in the log?
+ if (mds->mdlog->get_write_pos() == end_off) {
+ // yes. we're good.
+ } else {
+ // no. submit another import_map so that we can go away.
+ }
+ }
+};
+*/
- dout(10) << "EMknod.retire committing parent dir " << *dir << endl;
- mds->mdstore->commit_dir(dir, c);
-}
-
-bool EInodeUpdate::has_happened(MDS *mds)
+void EImportMap::expire(MDS *mds, Context *c)
{
- return false;
+ dout(10) << "EImportMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << endl;
+ mds->mdlog->import_map_expire_waiters.push_back(c);
}
-void EInodeUpdate::replay(MDS *mds)
+void EImportMap::replay(MDS *mds)
{
- dout(10) << "EInodeUpdate.replay " << *this << endl;
- CInode *in = trace.restore_trace(mds);
- in->mark_dirty();
+ dout(10) << "EImportMap.replay -- reconstructing import/export spanning tree" << endl;
+ assert(mds->mdcache->imports.empty());
- // mark parent dir dirty, and set version.
- // this may end up being below water when dir is fetched from disk.
- CDir *pdir = in->get_parent_dir();
- if (!pdir->is_dirty()) pdir->mark_dirty();
- pdir->set_version(trace.back().dirv);
+ // first, stick the spanning tree in my cache
+ metablob.replay(mds);
+
+ // restore import/export maps
+ for (set<inodeno_t>::iterator p = imports.begin();
+ p != imports.end();
+ ++p) {
+ mds->mdcache->add_ambiguous_import(*p, nested_exports[*p]);
+ mds->mdcache->finish_ambiguous_import(*p);
+ }
+
+ mds->mdcache->show_imports();
}
// -----------------------
// EUnlink
-bool EUnlink::can_expire(MDS *mds)
+bool EUnlink::has_expired(MDS *mds)
{
+ /*
// dir
CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
CDir *dir = 0;
if (in && in->get_last_committed_version() < inodetrace.back().inode.version)
return false;
}
-
+ */
return true;
}
-void EUnlink::retire(MDS *mds, Context *c)
+void EUnlink::expire(MDS *mds, Context *c)
{
+ /*
CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
CDir *dir = diri->dir;
assert(dir);
// okay!
dout(7) << "commiting dirty (from unlink) dir " << *dir << endl;
mds->mdstore->commit_dir(dir, dirv, c);
+ */
}
-bool EUnlink::has_happened(MDS *mds)
+void EUnlink::replay(MDS *mds)
+{
+}
+
+
+
+
+// -----------------------
+// EPurgeFinish
+
+
+bool EPurgeFinish::has_expired(MDS *mds)
{
return true;
}
-void EUnlink::replay(MDS *mds)
+void EPurgeFinish::expire(MDS *mds, Context *c)
{
}
+void EPurgeFinish::replay(MDS *mds)
+{
+}
+
+
+// =========================================================================
+
// -----------------------
-// EPurgeFinish
+// EExportStart
+bool EExportStart::has_expired(MDS *mds)
+{
+ CInode *diri = mds->mdcache->get_inode(dirino);
+ if (!diri) return true;
+ CDir *dir = diri->dir;
+ if (!dir) return true;
+ if (!mds->mdcache->migrator->is_exporting(dir))
+ return true;
+ dout(10) << "EExportStart.has_expired still exporting " << *dir << endl;
+ return false;
+}
-bool EPurgeFinish::can_expire(MDS *mds)
+void EExportStart::expire(MDS *mds, Context *c)
{
+ CInode *diri = mds->mdcache->get_inode(dirino);
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+ assert(mds->mdcache->migrator->is_exporting(dir));
+
+ dout(10) << "EExportStart.expire waiting for export of " << *dir << endl;
+ mds->mdcache->migrator->add_export_finish_waiter(dir, c);
+}
+
+void EExportStart::replay(MDS *mds)
+{
+ dout(10) << "EExportStart.replay " << dirino << " -> " << dest << endl;
+ metablob.replay(mds);
+
+ // put in pending_exports lists
+ mds->mdlog->pending_exports[dirino] = bounds;
+}
+
+// -----------------------
+// EExportFinish
+
+bool EExportFinish::has_expired(MDS *mds)
+{
+ // we can always expire.
return true;
}
-void EPurgeFinish::retire(MDS *mds, Context *c)
+void EExportFinish::expire(MDS *mds, Context *c)
{
+ assert(0); // should never happen.
}
-bool EPurgeFinish::has_happened(MDS *mds)
+void EExportFinish::replay(MDS *mds)
+{
+ dout(10) << "EExportFinish.replay " << dirino << " success=" << success << endl;
+
+ assert(mds->mdlog->pending_exports.count(dirino));
+
+ // finish?
+ if (success)
+ mds->mdcache->finish_ambiguous_export(dirino, mds->mdlog->pending_exports[dirino]);
+
+ // remove from pending_exports list
+ mds->mdlog->pending_exports.erase(dirino);
+}
+
+
+// -----------------------
+// EImportStart
+
+bool EImportStart::has_expired(MDS *mds)
+{
+ return metablob.has_expired(mds);
+}
+
+void EImportStart::expire(MDS *mds, Context *c)
+{
+ dout(10) << "EImportStart.expire " << dirino << endl;
+ metablob.expire(mds, c);
+}
+
+void EImportStart::replay(MDS *mds)
+{
+ dout(10) << "EImportStart.replay " << dirino << endl;
+ metablob.replay(mds);
+
+ // convert list -> set
+ set<inodeno_t> b;
+ for (list<inodeno_t>::iterator p = bounds.begin(); p != bounds.end(); ++p)
+ b.insert(*p);
+
+ // put in ambiguous import list
+ mds->mdcache->add_ambiguous_import(dirino, b);
+}
+
+// -----------------------
+// EImportFinish
+
+bool EImportFinish::has_expired(MDS *mds)
{
return true;
}
+void EImportFinish::expire(MDS *mds, Context *c)
+{
+ assert(0); // shouldn't ever happen
+}
-void EPurgeFinish::replay(MDS *mds)
+void EImportFinish::replay(MDS *mds)
{
+ dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl;
+ if (success)
+ mds->mdcache->finish_ambiguous_import(dirino);
+ else
+ mds->mdcache->cancel_ambiguous_import(dirino);
}
+
#include <math.h>
#include <ostream>
+#include <set>
+#include <map>
using namespace std;
#include "config.h"
#include <cassert>
+
+// md ops
+#define MDS_OP_STATFS 1
+
+#define MDS_OP_STAT 100
+#define MDS_OP_LSTAT 101
+#define MDS_OP_UTIME 102
+#define MDS_OP_CHMOD 103
+#define MDS_OP_CHOWN 104
+
+
+#define MDS_OP_READDIR 200
+#define MDS_OP_MKNOD 201
+#define MDS_OP_LINK 202
+#define MDS_OP_UNLINK 203
+#define MDS_OP_RENAME 204
+
+#define MDS_OP_MKDIR 220
+#define MDS_OP_RMDIR 221
+#define MDS_OP_SYMLINK 222
+
+#define MDS_OP_OPEN 301
+#define MDS_OP_TRUNCATE 306
+#define MDS_OP_FSYNC 307
+//#define MDS_OP_CLOSE 310
+#define MDS_OP_RELEASE 308
+
+
+
+// ================================================================
+
/* meta_load_t
 * hierarchical load for an inode/dir and its children
*/
inline ostream& operator<<( ostream& out, meta_load_t& load )
{
- return out << "metaload<rd " << load.pop[META_POP_IRD].get()
+ return out << "<rd " << load.pop[META_POP_IRD].get()
<< ", wr " << load.pop[META_POP_IWR].get()
<< ">";
}
*/
+// ================================================================
+// dir slices
+
+struct dirslice_t {
+ short hash_mask;
+ short hash_val;
+};
+
+
+
+// ================================================================
+
+#define MDS_PIN_REPLICATED 1
+
+class MDSCacheObject {
+ protected:
+ unsigned state; // state bits
+
+ int ref; // reference count
+ set<int> ref_set;
+
+ map<int,int> replicas; // [auth] mds -> nonce
+ int replica_nonce; // [replica] defined on replica
+
+ public:
+ MDSCacheObject() :
+ state(0),
+ ref(0),
+ replica_nonce(0) {}
+ virtual ~MDSCacheObject() {}
+
+ // --------------------------------------------
+ // state
+ unsigned get_state() { return state; }
+ void state_clear(unsigned mask) { state &= ~mask; }
+ void state_set(unsigned mask) { state |= mask; }
+ unsigned state_test(unsigned mask) { return state & mask; }
+ void state_reset(unsigned s) { state = s; }
+
+ // --------------------------------------------
+ // pins
+ int get_num_ref() { return ref; }
+ bool is_pinned_by(int by) { return ref_set.count(by); }
+ set<int>& get_ref_set() { return ref_set; }
+
+ virtual void last_put() {}
+ virtual void bad_put(int by) {
+ assert(ref_set.count(by) == 1);
+ assert(ref > 0);
+ }
+ void put(int by) {
+ if (ref == 0 || ref_set.count(by) != 1) {
+ bad_put(by);
+ } else {
+ ref--;
+ ref_set.erase(by);
+ assert(ref == (int)ref_set.size());
+ if (ref == 0)
+ last_put();
+ }
+ }
+
+ virtual void first_get() {}
+ virtual void bad_get(int by) {
+ assert(ref_set.count(by) == 0);
+ assert(0);
+ }
+ void get(int by) {
+ if (ref_set.count(by)) {
+ bad_get(by);
+ } else {
+ if (ref == 0)
+ first_get();
+ ref++;
+ ref_set.insert(by);
+ assert(ref == (int)ref_set.size());
+ }
+ }
+
+
+
+ // --------------------------------------------
+ // replication
+ bool is_replicated() { return !replicas.empty(); }
+ bool is_replica(int mds) { return replicas.count(mds); }
+ int num_replicas() { return replicas.size(); }
+ int add_replica(int mds) {
+ if (replicas.count(mds))
+ return ++replicas[mds]; // inc nonce
+ if (replicas.empty())
+ get(MDS_PIN_REPLICATED);
+ return replicas[mds] = 1;
+ }
+ void add_replica(int mds, int nonce) {
+ if (replicas.empty())
+ get(MDS_PIN_REPLICATED);
+ replicas[mds] = nonce;
+ }
+ int get_replica_nonce(int mds) {
+ assert(replicas.count(mds));
+ return replicas[mds];
+ }
+ void remove_replica(int mds) {
+ assert(replicas.count(mds));
+ replicas.erase(mds);
+ if (replicas.empty())
+ put(MDS_PIN_REPLICATED);
+ }
+ void clear_replicas() {
+ if (!replicas.empty())
+ put(MDS_PIN_REPLICATED);
+ replicas.clear();
+ }
+ map<int,int>::iterator replicas_begin() { return replicas.begin(); }
+ map<int,int>::iterator replicas_end() { return replicas.end(); }
+ const map<int,int>& get_replicas() { return replicas; }
+
+ int get_replica_nonce() { return replica_nonce;}
+ void set_replica_nonce(int n) { replica_nonce = n; }
+};
+
+
#endif
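// Illustrative sketch of the MDSCacheObject replica/pin bookkeeping defined above;
// the function name is hypothetical, and a bare MDSCacheObject stands in for the
// CInode/CDir/CDentry subclasses that use it in practice.
inline void mdscacheobject_pin_sketch() {
  MDSCacheObject o;
  int n1 = o.add_replica(3);         // first replica on mds3 -> nonce 1, takes MDS_PIN_REPLICATED
  int n2 = o.add_replica(3);         // re-replicating to the same mds just bumps the nonce -> 2
  assert(n1 == 1 && n2 == 2);
  assert(o.is_replicated() && o.is_pinned_by(MDS_PIN_REPLICATED));
  o.remove_replica(3);               // last replica removed -> MDS_PIN_REPLICATED pin is dropped
  assert(o.get_num_ref() == 0);
}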
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-/*
-
-
-OLD LOCK CRAP:
- (old):
- sync - soft metadata.. no reads/writes can proceed. (eg no stat)
- lock - hard(+soft) metadata.. path traversals stop etc. (??)
-
-
- replication consistency modes:
- hard+soft - hard and soft are defined on all replicas.
- all reads proceed (in absense of sync lock)
- writes require sync lock, fw to auth
- -> normal behavior.
-
- hard - hard only, soft is undefined
- reads require a sync
- writes proceed if field updates are monotonic (e.g. size, m/c/atime)
- -> 'softasync'
-
- types of access by cache users:
-
- hard soft
- R - read_hard_try path traversal
- R <= R read_soft_start stat
- R <= W write_soft_start touch
- W => W write_hard_start chmod
-
- note on those implications:
- read_soft_start() calls read_hard_try()
- write_soft_start() calls read_hard_try()
- a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a
- lock is held)
-
-
- relationship with frozen directories:
-
- read_hard_try - can proceed, because any hard changes require a lock, which
- requires an active authority, which implies things are unfrozen.
- write_hard_start - waits (has to; only auth can initiate)
- read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth)
- write_soft_start - ???? waits for now. (FIXME: if (softasync & !syncbyauth))
-
- if sticky is on, an export_dir will drop any sync or lock so that the freeze will
- proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing().
-
-
-
-NAMESPACE:
-
- none right now.
-
-
-*/
-
-
-/* soft sync locks: mtime, size, etc.
- */
-
-bool MDCache::read_soft_start(CInode *in, Message *m)
-{
- // if (!read_hard_try(in, m))
- // return false;
-
- // if frozen: i can't proceed (for now, see above)
- if (in->is_frozen()) {
- dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl;
- in->add_waiter(CDIR_WAIT_UNFREEZE,
- new C_MDS_RetryMessage(mds, m));
- return false;
- }
-
-
- dout(5) << "read_soft_start " << *in << endl;
-
- // what soft sync mode?
-
- if (in->is_softasync()) {
- // softasync: hard consistency only
-
- if (in->is_auth()) {
- // i am auth: i need sync
- if (in->is_syncbyme()) goto yes;
- if (in->is_lockbyme()) goto yes; // lock => sync
- if (!in->is_cached_by_anyone() &&
- !in->is_open_write()) goto yes; // i'm alone
- } else {
- // i am replica: fw to auth
- int auth = in->authority();
- dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- mds->messenger->send_message(m,
- MSG_ADDR_MDS(auth), m->get_dest_port(),
- MDS_PORT_CACHE);
- return false;
- }
- } else {
- // normal: soft+hard consistency
-
- if (in->is_syncbyauth()) {
- // wait for sync
- } else {
- // i'm consistent
- goto yes;
- }
- }
-
- // we need sync
- if (in->is_syncbyauth() && !in->is_softasync()) {
- dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl;
- } else if (in->is_softasync() && in->is_auth()) {
- dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl;
- } else
- assert(2+2==5);
-
- if (!in->can_auth_pin()) {
- dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl;
- in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
- new C_MDS_RetryMessage(mds,m));
- return false;
- }
-
- if (in->is_auth()) {
- // wait for sync
- in->add_waiter(CINODE_WAIT_SYNC,
- new C_MDS_RetryMessage(mds, m));
-
- if (!in->is_presync())
- inode_sync_start(in);
- } else {
- // wait for unsync
- in->add_waiter(CINODE_WAIT_UNSYNC,
- new C_MDS_RetryMessage(mds, m));
-
- assert(in->is_syncbyauth());
-
- if (!in->is_waitonunsync())
- inode_sync_wait(in);
- }
-
- return false;
-
- yes:
- mds->balancer->hit_inode(in, MDS_POP_SOFTRD);
- mds->balancer->hit_inode(in, MDS_POP_ANY);
- return true;
-}
-
-
-int MDCache::read_soft_finish(CInode *in)
-{
- dout(5) << "read_soft_finish " << *in << endl; // " soft_sync_count " << in->soft_sync_count << endl;
- return 0; // do nothing, actually..
-}
-
-
-bool MDCache::write_soft_start(CInode *in, Message *m)
-{
- // if (!read_hard_try(in, m))
- //return false;
-
- // if frozen: i can't proceed (for now, see above)
- if (in->is_frozen()) {
- dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl;
- in->add_waiter(CDIR_WAIT_UNFREEZE,
- new C_MDS_RetryMessage(mds, m));
- return false;
- }
-
- dout(5) << "write_soft_start " << *in << endl;
- // what soft sync mode?
-
- if (in->is_softasync()) {
- // softasync: hard consistency only
-
- if (in->is_syncbyauth()) {
- // wait for sync release
- } else {
- // i'm inconsistent; write away!
- goto yes;
- }
-
- } else {
- // normal: soft+hard consistency
-
- if (in->is_auth()) {
- // i am auth: i need sync
- if (in->is_syncbyme()) goto yes;
- if (in->is_lockbyme()) goto yes; // lock => sync
- if (!in->is_cached_by_anyone() &&
- !in->is_open_write()) goto yes; // i'm alone
- } else {
- // i am replica: fw to auth
- int auth = in->authority();
- dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- mds->messenger->send_message(m,
- MSG_ADDR_MDS(auth), m->get_dest_port(),
- MDS_PORT_CACHE);
- return false;
- }
- }
-
- // we need sync
- if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) {
- dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl;
- } else if (!in->is_softasync() && in->is_auth()) {
- dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl;
- } else
- assert(2+2==5);
-
- if (!in->can_auth_pin()) {
- dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl;
- in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
- new C_MDS_RetryMessage(mds,m));
- return false;
- }
-
- if (in->is_auth()) {
- // wait for sync
- in->add_waiter(CINODE_WAIT_SYNC,
- new C_MDS_RetryMessage(mds, m));
-
- if (!in->is_presync())
- inode_sync_start(in);
- } else {
- // wait for unsync
- in->add_waiter(CINODE_WAIT_UNSYNC,
- new C_MDS_RetryMessage(mds, m));
-
- assert(in->is_syncbyauth());
- assert(in->is_softasync());
-
- if (!in->is_waitonunsync())
- inode_sync_wait(in);
- }
-
- return false;
-
- yes:
- mds->balancer->hit_inode(in, MDS_POP_SOFTWR);
- mds->balancer->hit_inode(in, MDS_POP_ANY);
- return true;
-}
-
-
-int MDCache::write_soft_finish(CInode *in)
-{
- dout(5) << "write_soft_finish " << *in << endl; //" soft_sync_count " << in->soft_sync_count << endl;
- return 0; // do nothing, actually..
-}
-
-
-
-
-
-
-
-
-/* hard locks: owner, mode
- */
-
-/*
-bool MDCache::read_hard_try(CInode *in,
- Message *m)
-{
- //dout(5) << "read_hard_try " << *in << endl;
-
- if (in->is_auth()) {
- // auth
- goto yes; // fine
- } else {
- // replica
- if (in->is_lockbyauth()) {
- // locked by auth; wait!
- dout(7) << "read_hard_try waiting on " << *in << endl;
- in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m));
- if (!in->is_waitonunlock())
- inode_lock_wait(in);
- return false;
- } else {
- // not locked.
- goto yes;
- }
- }
-
- yes:
- mds->balancer->hit_inode(in, MDS_POP_HARDRD);
- mds->balancer->hit_inode(in, MDS_POP_ANY);
- return true;
-}
-
-
-bool MDCache::write_hard_start(CInode *in,
- Message *m)
-{
- // if frozen: i can't proceed; only auth can initiate lock
- if (in->is_frozen()) {
- dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl;
- in->add_waiter(CDIR_WAIT_UNFREEZE,
- new C_MDS_RetryMessage(mds, m));
- return false;
- }
-
- // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where
- // the freeze is waiting for our lock to be released)
-
-
- if (in->is_auth()) {
- // auth
- if (in->is_lockbyme()) goto success;
- if (!in->is_cached_by_anyone()) goto success;
-
- // need lock
- if (!in->can_auth_pin()) {
- dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl;
- in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m));
- return false;
- }
-
- in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m));
-
- if (!in->is_prelock())
- inode_lock_start(in);
-
- return false;
- } else {
- // replica
- // fw to auth
- int auth = in->authority();
- dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- mds->messenger->send_message(m,
- MSG_ADDR_MDS(auth), m->get_dest_port(),
- MDS_PORT_CACHE);
- return false;
- }
-
- success:
- in->lock_active_count++;
- dout(5) << "write_hard_start " << *in << " count now " << in->lock_active_count << endl;
- assert(in->lock_active_count > 0);
-
- mds->balancer->hit_inode(in, MDS_POP_HARDWR);
- mds->balancer->hit_inode(in, MDS_POP_ANY);
- return true;
-}
-
-void MDCache::write_hard_finish(CInode *in)
-{
- in->lock_active_count--;
- dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl;
- assert(in->lock_active_count >= 0);
-
- // release lock?
- if (in->lock_active_count == 0 &&
- in->is_lockbyme() &&
- !g_conf.mdcache_sticky_lock) {
- dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl;
- inode_lock_release(in);
- }
-}
-
-
-void MDCache::inode_lock_start(CInode *in)
-{
- dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl;
-
- assert(in->is_auth());
- assert(!in->is_prelock());
- assert(!in->is_lockbyme());
- assert(!in->is_lockbyauth());
-
- in->lock_waiting_for_ack = in->cached_by;
- in->dist_state |= CINODE_DIST_PRELOCK;
- in->get(CINODE_PIN_PRELOCK);
- in->auth_pin();
-
- // send messages
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()),
- MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-}
-
-
-void MDCache::inode_lock_release(CInode *in)
-{
- dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl;
-
- assert(in->is_lockbyme());
- assert(in->is_auth());
-
- in->dist_state &= ~CINODE_DIST_LOCKBYME;
-
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- mds->messenger->send_message(new MInodeLockRelease(in),
- MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-
- in->auth_unpin();
-}
-
-void MDCache::inode_lock_wait(CInode *in)
-{
- dout(5) << "lock_wait on " << *in << endl;
- assert(!in->is_auth());
- assert(in->is_lockbyauth());
-
- in->dist_state |= CINODE_DIST_WAITONUNLOCK;
- in->get(CINODE_PIN_WAITONUNLOCK);
-}
-
-
-void MDCache::handle_inode_lock_start(MInodeLockStart *m)
-{
- // authority is requesting a lock
- CInode *in = get_inode(m->get_ino());
- if (!in) {
- // don't have it anymore!
- dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
- mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false),
- MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- delete m; // done
- return;
- }
-
- // we shouldn't be authoritative...
- assert(!in->is_auth());
-
- dout(7) << "handle_lock_start " << *in << ", sending ack" << endl;
-
- // lock it
- in->dist_state |= CINODE_DIST_LOCKBYAUTH;
-
- // sanity check: make sure we know who _is_ authoritative!
- assert(m->get_asker() == in->authority());
-
- // send ack
- mds->messenger->send_message(new MInodeLockAck(in->ino()),
- MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
-
- delete m; // done
-}
-
-
-void MDCache::handle_inode_lock_ack(MInodeLockAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- int from = m->get_source();
- dout(7) << "handle_lock_ack from " << from << " on " << *in << endl;
-
- assert(in);
- assert(in->is_auth());
- assert(in->dist_state & CINODE_DIST_PRELOCK);
-
- // remove it from waiting list
- in->lock_waiting_for_ack.erase(from);
-
- if (!m->did_have()) {
- // erase from cached_by too!
- in->cached_by_remove(from);
- }
-
- if (in->lock_waiting_for_ack.size()) {
-
- // more coming
- dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl;
-
- } else {
-
- // yay!
- dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl;
-
- in->dist_state &= ~CINODE_DIST_PRELOCK;
- in->dist_state |= CINODE_DIST_LOCKBYME;
- in->put(CINODE_PIN_PRELOCK);
-
- // do waiters!
- in->finish_waiting(CINODE_WAIT_LOCK);
- }
-
- delete m; // done
-}
-
-
-void MDCache::handle_inode_lock_release(MInodeLockRelease *m)
-{
- CInode *in = get_inode(m->get_ino());
-
- if (!in) {
- dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl;
- delete m; // done
- return;
- }
-
- if (!in->is_lockbyauth()) {
- dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl;
- assert(0); // i should have it, locked, or not have it at all!
- delete m; // done
- return;
- }
-
- dout(7) << "handle_lock_release " << *in << endl;
- assert(!in->is_auth());
-
- // release state
- in->dist_state &= ~CINODE_DIST_LOCKBYAUTH;
-
- // waiters?
- if (in->is_waitonunlock()) {
- in->put(CINODE_PIN_WAITONUNLOCK);
- in->dist_state &= ~CINODE_DIST_WAITONUNLOCK;
-
- // finish
- in->finish_waiting(CINODE_WAIT_UNLOCK);
- }
-
- // done
- delete m;
-}
-*/
-
-
-
-
-
-
-
-
-
-// sync interface
-
-void MDCache::inode_sync_wait(CInode *in)
-{
- assert(!in->is_auth());
-
- int auth = in->authority();
- dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl;
-
- assert(in->is_syncbyauth());
- assert(!in->is_waitonunsync());
-
- in->dist_state |= CINODE_DIST_WAITONUNSYNC;
- in->get(CINODE_PIN_WAITONUNSYNC);
-
- if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) ||
- (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) {
- // actually recall; if !sticky, auth will immediately release.
- dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl;
- mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino),
- MSG_ADDR_MDS(auth), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-}
-
-
-void MDCache::inode_sync_start(CInode *in)
-{
- // wait for all replicas
- dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl;
-
- assert(in->is_auth());
- assert(!in->is_presync());
- assert(!in->is_sync());
-
- in->sync_waiting_for_ack.clear();
- in->dist_state |= CINODE_DIST_PRESYNC;
- in->get(CINODE_PIN_PRESYNC);
- in->auth_pin();
-
- in->sync_replicawantback = false;
-
- // send messages
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it));
- mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()),
- MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-
- // sync clients
- int last = -1;
- for (multiset<int>::iterator it = in->get_open_write().begin();
- it != in->get_open_write().end();
- it++) {
- if (*it == last) continue; last = *it; // only 1 per client (even if open multiple times)
- in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it));
- mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
- MSG_ADDR_CLIENT(*it), 0,
- MDS_PORT_CACHE);
- }
-
-}
-
-void MDCache::inode_sync_release(CInode *in)
-{
- dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl;
-
- assert(in->is_syncbyme());
- assert(in->is_auth());
-
- in->dist_state &= ~CINODE_DIST_SYNCBYME;
-
- // release replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- mds->messenger->send_message(new MInodeSyncRelease(in),
- MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-
- // release writers
- for (multiset<int>::iterator it = in->get_open_write().begin();
- it != in->get_open_write().end();
- it++) {
- mds->messenger->send_message(new MInodeSyncRelease(in),
- MSG_ADDR_CLIENT(*it), 0,
- MDS_PORT_CACHE);
- }
-
- in->auth_unpin();
-}
-
-
-
-
-// messages
-void MDCache::handle_inode_sync_start(MInodeSyncStart *m)
-{
- // assume asker == authority for now.
-
- // authority is requesting a lock
- CInode *in = get_inode(m->get_ino());
- if (!in) {
- // don't have it anymore!
- dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
- mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false),
- MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- delete m; // done
- return;
- }
-
- dout(10) << "handle_sync_start " << *in << endl;
-
- // we shouldn't be authoritative...
- assert(!in->is_auth());
-
- // sanity check: make sure we know who _is_ authoritative!
- assert(m->get_asker() == in->authority());
-
- // lock it
- in->dist_state |= CINODE_DIST_SYNCBYAUTH;
-
- // open for write by clients?
- if (in->is_open_write()) {
- dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl;
-
- // sync clients
- in->sync_waiting_for_ack.clear();
- for (multiset<int>::iterator it = in->get_open_write().begin();
- it != in->get_open_write().end();
- it++) {
- in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it));
- mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
- MSG_ADDR_CLIENT(*it), 0,
- MDS_PORT_CACHE);
- }
-
- in->pending_sync_request = m;
- } else {
- // no writers, ack.
- dout(7) << "handle_sync_start " << *in << ", sending ack" << endl;
-
- inode_sync_ack(in, m);
- }
-}
-
-void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback)
-{
- dout(7) << "sending inode_sync_ack " << *in << endl;
-
- // send ack
- mds->messenger->send_message(new MInodeSyncAck(in->ino(), true, wantback),
- MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
-
- delete m;
-}
-
-void MDCache::handle_inode_sync_ack(MInodeSyncAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
-
- dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl;
-
- if (in->is_auth()) {
- assert(in->is_presync());
- } else {
- assert(in->is_syncbyauth());
- assert(in->pending_sync_request);
- }
-
- // remove it from waiting list
- in->sync_waiting_for_ack.erase(m->get_source());
-
- if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) {
- // erase from cached_by too!
- in->cached_by_remove(m->get_source());
- }
-
- if (m->replica_wantsback())
- in->sync_replicawantback = true;
-
- if (in->sync_waiting_for_ack.size()) {
-
- // more coming
- dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl;
-
- } else {
-
- // yay!
- dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl;
-
- if (!in->is_auth()) {
- // replica, sync ack back to auth
- assert(in->pending_sync_request);
- inode_sync_ack(in, in->pending_sync_request, true);
- in->pending_sync_request = 0;
- delete m;
- return;
- }
-
- in->dist_state &= ~CINODE_DIST_PRESYNC;
- in->dist_state |= CINODE_DIST_SYNCBYME;
- in->put(CINODE_PIN_PRESYNC);
-
- // do waiters!
- in->finish_waiting(CINODE_WAIT_SYNC);
-
-
- // release sync right away?
- if (in->is_syncbyme()) {
- if (in->is_freezing()) {
- dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl;
- inode_sync_release(in);
- }
- else if (in->sync_replicawantback) {
- dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl;
- inode_sync_release(in);
- }
- else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) ||
- (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) {
- dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl;
- inode_sync_release(in);
- }
- else {
- dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl;
- }
- } else {
- dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl;
- }
- }
-
- delete m; // done
-}
-
-
-void MDCache::handle_inode_sync_release(MInodeSyncRelease *m)
-{
- CInode *in = get_inode(m->get_ino());
-
- if (!in) {
- dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl;
- delete m; // done
- return;
- }
-
- if (!in->is_syncbyauth()) {
- dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl;
- assert(0); // this shouldn't happen.
- delete m; // done
- return;
- }
-
- dout(7) << "handle_sync_release " << *in << endl;
- assert(!in->is_auth());
-
- // release state
- in->dist_state &= ~CINODE_DIST_SYNCBYAUTH;
-
- // waiters?
- if (in->is_waitonunsync()) {
- in->put(CINODE_PIN_WAITONUNSYNC);
- in->dist_state &= ~CINODE_DIST_WAITONUNSYNC;
-
- // finish
- in->finish_waiting(CINODE_WAIT_UNSYNC);
- }
-
- // client readers?
- if (in->is_open_write()) {
- dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl;
- for (multiset<int>::iterator it = in->get_open_write().begin();
- it != in->get_open_write().end();
- it++) {
- mds->messenger->send_message(new MInodeSyncRelease(in),
- MSG_ADDR_CLIENT(*it), 0,
- MDS_PORT_CACHE);
- }
- }
-
-
- // done
- delete m;
-}
-
-
-void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m)
-{
- CInode *in = get_inode(m->get_ino());
-
- if (!in) {
- dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl;
- assert(0); // shouldn't happen
- delete m; // done
- return;
- }
- if(!in->is_auth()) {
- do_ino_proxy(in, m);
- return;
- }
-
- if (in->is_syncbyme()) {
- dout(7) << "handle_sync_recall " << *in << ", releasing" << endl;
- inode_sync_release(in);
- }
- else if (in->is_presync()) {
- dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl;
- in->sync_replicawantback = true;
- }
- else {
- dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl;
- }
-
- // done
- delete m;
-}
-
-
-
-
-
-
-
-
-
-
-// DIR SYNC
-
-/*
-
- dir sync
-
- - this are used when a directory is HASHED only. namely,
- - to stat the dir inode we need an accurate directory size (????)
- - for a readdir
-
-*/
-
-void MDCache::dir_sync_start(CDir *dir)
-{
- // wait for all replicas
- dout(5) << "sync_start on " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(!dir->is_presync());
- assert(!dir->is_sync());
-
- dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set();
- dir->state_set(CDIR_STATE_PRESYNC);
- dir->auth_pin();
-
- //dir->sync_replicawantback = false;
-
- // send messages
- for (set<int>::iterator it = dir->sync_waiting_for_ack.begin();
- it != dir->sync_waiting_for_ack.end();
- it++) {
- mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()),
- MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
- MDS_PORT_CACHE);
- }
-}
-
-
-void MDCache::dir_sync_release(CDir *dir)
-{
-
-
-}
-
-void MDCache::dir_sync_wait(CDir *dir)
-{
-
-}
-
-
-void handle_dir_sync_start(MDirSyncStart *m)
-{
-}
-
-
-
-
#ifndef __MCACHEEXPIRE_H
#define __MCACHEEXPIRE_H
-
class MCacheExpire : public Message {
+ int from;
map<inodeno_t, int> inodes;
map<inodeno_t, int> dirs;
- int from;
+ map<inodeno_t, map<string,int> > dentries;
public:
+ int get_from() { return from; }
map<inodeno_t,int>& get_inodes() { return inodes; }
map<inodeno_t,int>& get_dirs() { return dirs; }
- int get_from() { return from; }
+ map<inodeno_t, map<string,int> >& get_dentries() { return dentries; }
MCacheExpire() {}
- MCacheExpire(int from) : Message(MSG_MDS_CACHEEXPIRE) {
- this->from = from;
- }
+ MCacheExpire(int f) :
+ Message(MSG_MDS_CACHEEXPIRE),
+ from(f) { }
+
virtual char *get_type_name() { return "CEx";}
void add_inode(inodeno_t ino, int nonce) {
- inodes.insert(pair<inodeno_t,int>(ino,nonce));
+ inodes[ino] = nonce;
}
void add_dir(inodeno_t ino, int nonce) {
- dirs.insert(pair<inodeno_t,int>(ino,nonce));
+ dirs[ino] = nonce;
+ }
+ void add_dentry(inodeno_t dirino, const string& dn, int nonce) {
+ dentries[dirino][dn] = nonce;
+ }
+ void add_dentries(inodeno_t dirino, map<string,int>& dmap) {
+ dentries[dirino] = dmap;
}
- virtual void decode_payload(crope& s, int& off) {
- int n;
+ void decode_payload() {
+ int off = 0;
- s.copy(off, sizeof(from), (char*)&from);
+ payload.copy(off, sizeof(from), (char*)&from);
off += sizeof(from);
- // inodes
- s.copy(off, sizeof(int), (char*)&n);
- off += sizeof(int);
- for (int i=0; i<n; i++) {
- inodeno_t ino;
- int nonce;
- s.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- s.copy(off, sizeof(int), (char*)&nonce);
- off += sizeof(int);
- inodes.insert(pair<inodeno_t, int>(ino,nonce));
- }
+ ::_decode(inodes, payload, off);
+ ::_decode(dirs, payload, off);
- // dirs
- s.copy(off, sizeof(int), (char*)&n);
+ int n;
+ payload.copy(off, sizeof(int), (char*)&n);
off += sizeof(int);
for (int i=0; i<n; i++) {
inodeno_t ino;
- int nonce;
- s.copy(off, sizeof(ino), (char*)&ino);
+ payload.copy(off, sizeof(ino), (char*)&ino);
off += sizeof(ino);
- s.copy(off, sizeof(int), (char*)&nonce);
- off += sizeof(int);
- dirs.insert(pair<inodeno_t, int>(ino,nonce));
+ ::_decode(dentries[ino], payload, off);
}
}
-
- void rope_map(crope& s, map<inodeno_t,int>& mp) {
- int n = mp.size();
- s.append((char*)&n, sizeof(int));
- for (map<inodeno_t,int>::iterator it = mp.begin();
- it != mp.end();
- it++) {
- inodeno_t ino = it->first;
- int nonce = it->second;
- s.append((char*)&ino, sizeof(ino));
- s.append((char*)&nonce, sizeof(nonce));
- }
- }
- virtual void encode_payload(crope& s) {
- s.append((char*)&from, sizeof(from));
- rope_map(s, inodes);
- rope_map(s, dirs);
+ void encode_payload() {
+ payload.append((char*)&from, sizeof(from));
+ ::_encode(inodes, payload);
+ ::_encode(dirs, payload);
+
+ int n = dentries.size();
+ payload.append((char*)&n, sizeof(n));
+ for (map<inodeno_t, map<string,int> >::iterator p = dentries.begin();
+ p != dentries.end();
+ ++p) {
+ payload.append((char*)&p->first, sizeof(p->first));
+ ::_encode(p->second, payload);
+ }
}
};
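The expire message now carries per-dentry nonces keyed by directory ino alongside the inode and dir maps. A minimal usage sketch, not part of the patch; the helper name and nonce arguments are placeholders for whatever replica state the caller tracks:

// Sketch only: batch up expires for one directory's worth of cache state.
void add_expires(MCacheExpire *ex,
                 inodeno_t dirino, const string& dname,
                 inodeno_t ino,
                 int inode_nonce, int dir_nonce, int dn_nonce)
{
  ex->add_inode(ino, inode_nonce);          // as before
  ex->add_dir(dirino, dir_nonce);           // as before
  ex->add_dentry(dirino, dname, dn_nonce);  // new: expire a single dentry
}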
#include "msg/Message.h"
#include "include/filepath.h"
+#include "mds/mdstypes.h"
#include "mds/MDS.h"
/**
string error_dentry; // dentry that was not found (to trigger waiters on asker)
- vector<CDirDiscover*> dirs; // not inode-aligned if no_base_dir = true.
- filepath path; // not inode-aligned if no_base_dentry = true
- vector<bool> path_xlock;
- vector<CInodeDiscover*> inodes;
+ vector<CDirDiscover*> dirs; // not inode-aligned if no_base_dir = true.
+ vector<CDentryDiscover*> dentries; // not inode-aligned if no_base_dentry = true
+ vector<CInodeDiscover*> inodes;
+
+ string path;
public:
// accessors
inodeno_t get_base_ino() { return base_ino; }
int get_num_inodes() { return inodes.size(); }
- int get_num_dentries() { return path.depth(); }
+ int get_num_dentries() { return dentries.size(); }
int get_num_dirs() { return dirs.size(); }
int get_depth() { // return depth of deepest object (in dir/dentry/inode units)
return max( inodes.size(), // at least this many
- max( no_base_dentry + path.depth() + flag_error_dn, // inode start + path + possible error
+ max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error
dirs.size() + no_base_dir )); // dn/inode + dirs
}
bool has_base_dir() { return !no_base_dir && dirs.size(); }
- bool has_base_dentry() { return !no_base_dentry && path.depth(); }
+ bool has_base_dentry() { return !no_base_dentry && dentries.size(); }
bool has_root() {
if (base_ino == 0) {
assert(no_base_dir && no_base_dentry);
}
return false;
}
- const string& get_path() { return path.get_path(); }
- bool get_path_xlock(int i) { return path_xlock[i]; }
+
+ const string& get_path() { return path; }
// bool is_flag_forward() { return flag_forward; }
bool is_flag_error_dn() { return flag_error_dn; }
// these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set.
CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); }
- const string& get_dentry(int n) { return path[n - no_base_dentry]; }
- bool get_dentry_xlock(int n) { return path_xlock[n - no_base_dentry]; }
+ CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); }
CInodeDiscover& get_inode(int n) { return *(inodes[n]); }
inodeno_t get_ino(int n) { return inodes[n]->get_ino(); }
// builders
bool is_empty() {
- return dirs.empty() && path.depth() == 0 &&
- inodes.empty() &&
+ return dirs.empty() && dentries.empty() && inodes.empty() &&
!flag_error_dn &&
!flag_error_dir;
}
- void set_path(const filepath& dp) { path = dp; }
- void add_dentry(const string& dn, bool xlock) {
- if (path.depth() == 0 && dirs.empty()) no_base_dir = true;
- path.add_dentry(dn);
- path_xlock.push_back(xlock);
+ void add_dentry(CDentryDiscover* ddis) {
+ if (dentries.empty() && dirs.empty()) no_base_dir = true;
+ dentries.push_back(ddis);
+ if (path.length()) path += "/";
+ path += ddis->get_dname();
}
-
+
void add_inode(CInodeDiscover* din) {
- if (inodes.empty() && path.depth() == 0) no_base_dir = no_base_dentry = true;
+ if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true;
inodes.push_back( din );
}
}
//dout(12) << n << " inodes out" << endl;
- // filepath
- path._decode(payload, off);
- //dout(12) << path.depth() << " dentries out" << endl;
-
- // path_xlock
+ // dentries
payload.copy(off, sizeof(int), (char*)&n);
off += sizeof(int);
for (int i=0; i<n; i++) {
- bool b;
- payload.copy(off, sizeof(bool), (char*)&b);
- off += sizeof(bool);
- path_xlock.push_back(b);
+ dentries.push_back( new CDentryDiscover() );
+ dentries[i]->_decode(payload, off);
}
}
void encode_payload() {
(*it)->_encode( payload );
//dout(12) << n << " inodes in" << endl;
- // path
- path._encode( payload );
- //dout(12) << path.depth() << " dentries in" << endl;
-
- // path_xlock
- n = path_xlock.size();
+ // dentries
+ n = dentries.size();
payload.append((char*)&n, sizeof(int));
- for (vector<bool>::iterator it = path_xlock.begin();
- it != path_xlock.end();
- it++) {
- bool b = *it;
- payload.append((char*)&b, sizeof(bool));
- }
+ for (vector<CDentryDiscover*>::iterator it = dentries.begin();
+ it != dentries.end();
+ it++)
+ (*it)->_encode( payload );
+ //dout(12) << n << " dentries in" << endl;
}
};
class MExportDir : public Message {
inodeno_t ino;
- int ndirs;
- bufferlist state;
-
- list<inodeno_t> exports;
-
- // hashed pre-discovers
- //map<inodeno_t, set<string> > hashed_prediscover;
+  list<bufferlist> dirstate;  // a bl for each dir
+ list<inodeno_t> exports;
public:
MExportDir() {}
- MExportDir(CInode *in) :
- Message(MSG_MDS_EXPORTDIR) {
- this->ino = in->inode.ino;
- ndirs = 0;
+ MExportDir(inodeno_t dirino) :
+ Message(MSG_MDS_EXPORTDIR),
+ ino(dirino) {
}
virtual char *get_type_name() { return "Ex"; }
inodeno_t get_ino() { return ino; }
- int get_ndirs() { return ndirs; }
- bufferlist& get_state() { return state; }
+ list<bufferlist>& get_dirstate() { return dirstate; }
list<inodeno_t>& get_exports() { return exports; }
-
+
void add_dir(bufferlist& dir) {
- state.claim_append( dir );
- ndirs++;
+ dirstate.push_back(dir);
+ }
+ void set_dirstate(const list<bufferlist>& ls) {
+ dirstate = ls;
+ }
+ void add_export(inodeno_t dirino) {
+ exports.push_back(dirino);
}
- void add_export(CDir *dir) { exports.push_back(dir->ino()); }
-
virtual void decode_payload() {
int off = 0;
payload.copy(off, sizeof(ino), (char*)&ino);
off += sizeof(ino);
- payload.copy(off, sizeof(ndirs), (char*)&ndirs);
- off += sizeof(ndirs);
-
- // exports
- int nex;
- payload.copy(off, sizeof(nex), (char*)&nex);
- off += sizeof(int);
- dout(12) << nex << " nested exports out" << endl;
- for (int i=0; i<nex; i++) {
- inodeno_t dirino;
- payload.copy(off, sizeof(dirino), (char*)&dirino);
- off += sizeof(dirino);
- exports.push_back(dirino);
- }
-
- // dir data
- size_t len;
- payload.copy(off, sizeof(len), (char*)&len);
- off += sizeof(len);
- state.substr_of(payload, off, len);
- off += len;
+ ::_decode(exports, payload, off);
+ ::_decode(dirstate, payload, off);
}
virtual void encode_payload() {
payload.append((char*)&ino, sizeof(ino));
- payload.append((char*)&ndirs, sizeof(ndirs));
-
- // exports
- int nex = exports.size();
- dout(12) << nex << " nested exports in" << endl;
- payload.append((char*)&nex, sizeof(int));
- for (list<inodeno_t>::iterator it = exports.begin();
- it != exports.end();
- it++) {
- inodeno_t ino = *it;
- payload.append((char*)&ino, sizeof(ino));
- }
-
- // dir data
- size_t len = state.length();
- payload.append((char*)&len, sizeof(len));
- payload.claim_append(state);
+ ::_encode(exports, payload);
+ ::_encode(dirstate, payload);
}
};
void add_export(inodeno_t dirino) {
exports.push_back( dirino );
}
- void add_inode(inodeno_t dirino, string& dentry, CInodeDiscover *in) {
+ void add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) {
inodes.push_back(in);
inode_dirino.insert(pair<inodeno_t, inodeno_t>(in->get_ino(), dirino));
inode_dentry.insert(pair<inodeno_t, string>(in->get_ino(), dentry));
class MFailure : public Message {
public:
- msg_addr_t failed;
+ entity_name_t failed;
entity_inst_t inst;
MFailure() {}
- MFailure(msg_addr_t f, entity_inst_t& i) :
+ MFailure(entity_name_t f, entity_inst_t& i) :
Message(MSG_FAILURE),
failed(f), inst(i) {}
- msg_addr_t get_failed() { return failed; }
+ entity_name_t get_failed() { return failed; }
entity_inst_t& get_inst() { return inst; }
void decode_payload() {
class MFailureAck : public Message {
public:
- msg_addr_t failed;
+ entity_name_t failed;
MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) {
this->failed = m->get_failed();
}
MFailureAck() {}
- msg_addr_t get_failed() { return failed; }
+ entity_name_t get_failed() { return failed; }
virtual void decode_payload(crope& s, int& off) {
s.copy(0, sizeof(failed), (char*)&failed);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSBEACON_H
+#define __MMDSBEACON_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+#include "mds/MDSMap.h"
+
+class MMDSBeacon : public Message {
+ int state;
+ version_t seq;
+
+ public:
+ MMDSBeacon() : Message(MSG_MDS_BEACON) {}
+ MMDSBeacon(int st, version_t se) : Message(MSG_MDS_BEACON),
+ state(st), seq(se) { }
+
+ int get_state() { return state; }
+ version_t get_seq() { return seq; }
+ char *get_type_name() { return "mdsbeacon"; }
+
+ void print(ostream& out) {
+ out << "mdsbeacon(" << MDSMap::get_state_name(state)
+ << " seq " << seq << ")";
+ }
+
+ void encode_payload() {
+ payload.append((char*)&state, sizeof(state));
+ payload.append((char*)&seq, sizeof(seq));
+ }
+ void decode_payload() {
+ int off = 0;
+ payload.copy(off, sizeof(state), (char*)&state);
+ off += sizeof(state);
+ payload.copy(off, sizeof(seq), (char*)&seq);
+ off += sizeof(seq);
+ }
+};
+
+#endif
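A sketch of the beacon round trip implied by the monitor changes further down in this patch. Not part of the patch itself; `send_beacon`, `monmap`, and `last_seq` are hypothetical MDS-side names:

#include "msg/Messenger.h"
#include "mon/MonMap.h"
#include "messages/MMDSBeacon.h"

// Sketch: an MDS announces its state; MDSMonitor::handle_mds_beacon echoes
// (state, seq) back to the sender, so stale acks can be dropped by comparing
// the echoed seq against last_seq.
void send_beacon(Messenger *messenger, MonMap *monmap,
                 int state, version_t& last_seq)
{
  messenger->send_message(new MMDSBeacon(state, ++last_seq),
                          monmap->get_inst(0));   // e.g. monitor 0
}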
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSCACHEREJOIN_H
+#define __MMDSCACHEREJOIN_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// sent from replica to auth
+
+class MMDSCacheRejoin : public Message {
+ public:
+ map<inodeno_t,int> inodes; // ino -> caps_wanted
+ set<inodeno_t> dirs;
+ map<inodeno_t, set<string> > dentries; // dir -> (dentries...)
+
+ MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {}
+
+ char *get_type_name() { return "cache_rejoin"; }
+
+ void print(ostream& out) {
+ out << "cache_rejoin" << endl;
+ }
+
+ void add_dir(inodeno_t dirino) {
+ dirs.insert(dirino);
+ }
+ void add_dentry(inodeno_t dirino, const string& dn) {
+ dentries[dirino].insert(dn);
+ }
+ void add_inode(inodeno_t ino, int cw) {
+ inodes[ino] = cw;
+ }
+
+ void encode_payload() {
+ ::_encode(inodes, payload);
+ ::_encode(dirs, payload);
+ for (set<inodeno_t>::iterator p = dirs.begin(); p != dirs.end(); ++p)
+ ::_encode(dentries[*p], payload);
+ }
+ void decode_payload() {
+ int off = 0;
+ ::_decode(inodes, payload, off);
+ ::_decode(dirs, payload, off);
+ for (set<inodeno_t>::iterator p = dirs.begin(); p != dirs.end(); ++p)
+ ::_decode(dentries[*p], payload, off);
+ }
+};
+
+#endif
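Note that encode_payload() only walks the `dirs` set when writing dentries, so a dir must be added before (or along with) its dentries or they are silently dropped on the wire. A usage sketch, not part of the patch; argument names are placeholders:

// Sketch: a replica describing one cached dentry + inode to the auth.
MMDSCacheRejoin *make_rejoin(inodeno_t dirino, const string& dname,
                             inodeno_t ino, int caps_wanted)
{
  MMDSCacheRejoin *rejoin = new MMDSCacheRejoin;
  rejoin->add_dir(dirino);            // required, or the dentry below is lost
  rejoin->add_dentry(dirino, dname);
  rejoin->add_inode(ino, caps_wanted);
  return rejoin;
}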
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSCACHEREJOINACK_H
+#define __MMDSCACHEREJOINACK_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// sent from auth back to replica
+
+class MMDSCacheRejoinAck : public Message {
+ public:
+ struct inodeinfo {
+ inodeno_t ino;
+ int hardlock;
+ int filelock;
+ int nonce;
+ inodeinfo() {}
+ inodeinfo(inodeno_t i, int h, int f, int n) : ino(i), hardlock(h), filelock(f), nonce(n) {}
+ };
+ struct dninfo {
+ int lock;
+ int nonce;
+ dninfo() {}
+ dninfo(int l, int n) : lock(l), nonce(n) {}
+ };
+ struct dirinfo {
+ inodeno_t dirino;
+ int nonce;
+ dirinfo() {}
+ dirinfo(inodeno_t i, int n) : dirino(i), nonce(n) {}
+ };
+ list<inodeinfo> inodes;
+ map<inodeno_t, map<string,dninfo> > dentries;
+ list<dirinfo> dirs;
+
+ MMDSCacheRejoinAck() : Message(MSG_MDS_CACHEREJOINACK) {}
+
+ char *get_type_name() { return "cache_rejoin_ack"; }
+
+ void print(ostream& out) {
+ out << "cache_rejoin" << endl;
+ }
+
+ void add_dir(inodeno_t dirino, int nonce) {
+ dirs.push_back(dirinfo(dirino,nonce));
+ }
+ void add_dentry(inodeno_t dirino, const string& dn, int ls, int nonce) {
+ dentries[dirino][dn] = dninfo(ls, nonce);
+ }
+ void add_inode(inodeno_t ino, int hl, int fl, int nonce) {
+ inodes.push_back(inodeinfo(ino, hl, fl, nonce));
+ }
+
+ void encode_payload() {
+ ::_encode(inodes, payload);
+ ::_encode(dirs, payload);
+ for (list<dirinfo>::iterator p = dirs.begin(); p != dirs.end(); ++p)
+ ::_encode(dentries[p->dirino], payload);
+ }
+ void decode_payload() {
+ int off = 0;
+ ::_decode(inodes, payload, off);
+ ::_decode(dirs, payload, off);
+ for (list<dirinfo>::iterator p = dirs.begin(); p != dirs.end(); ++p)
+ ::_decode(dentries[p->dirino], payload, off);
+ }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSIMPORTMAP_H
+#define __MMDSIMPORTMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+
+class MMDSImportMap : public Message {
+ public:
+ map<inodeno_t, set<inodeno_t> > imap;
+ map<inodeno_t, set<inodeno_t> > ambiguous_imap;
+
+ MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {}
+
+ char *get_type_name() { return "mdsimportmap"; }
+
+ void print(ostream& out) {
+ out << "mdsimportmap(" << imap.size()
+ << "+" << ambiguous_imap.size()
+ << " imports)";
+ }
+
+ void add_import(inodeno_t im) {
+ imap[im].clear();
+ }
+ void add_import_export(inodeno_t im, inodeno_t ex) {
+ imap[im].insert(ex);
+ }
+
+ void add_ambiguous_import(inodeno_t im, const set<inodeno_t>& m) {
+ ambiguous_imap[im] = m;
+ }
+
+ void encode_payload() {
+ ::_encode(imap, payload);
+ ::_encode(ambiguous_imap, payload);
+ }
+ void decode_payload() {
+ int off = 0;
+ ::_decode(imap, payload, off);
+ ::_decode(ambiguous_imap, payload, off);
+ }
+};
+
+#endif
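A sketch of how a recovering MDS might populate its import map, not part of the patch; all names are placeholders:

// Sketch: one settled import with a nested export, plus one import whose
// migration was still in flight (ambiguous) with a known set of bounds.
MMDSImportMap *make_importmap(inodeno_t im, inodeno_t nested_ex,
                              inodeno_t ambiguous_im,
                              const set<inodeno_t>& bounds)
{
  MMDSImportMap *m = new MMDSImportMap;
  m->add_import(im);
  m->add_import_export(im, nested_ex);
  m->add_ambiguous_import(ambiguous_im, bounds);
  return m;
}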
class MMDSMap : public Message {
public:
+ /*
map<epoch_t, bufferlist> maps;
map<epoch_t, bufferlist> incremental_maps;
(e == 0 || i->first > e)) e = i->first;
return e;
}
+ */
+ version_t epoch;
+ bufferlist encoded;
+
+ version_t get_epoch() const { return epoch; }
+ bufferlist& get_encoded() { return encoded; }
MMDSMap() :
Message(MSG_MDS_MAP) {}
MMDSMap(MDSMap *mm) :
Message(MSG_MDS_MAP) {
- mm->encode(maps[mm->get_epoch()]);
+ epoch = mm->get_epoch();
+ mm->encode(encoded);
}
// marshalling
virtual void decode_payload() {
int off = 0;
- ::_decode(maps, payload, off);
- ::_decode(incremental_maps, payload, off);
+ payload.copy(off, sizeof(epoch), (char*)&epoch);
+ off += sizeof(epoch);
+ ::_decode(encoded, payload, off);
}
virtual void encode_payload() {
- ::_encode(maps, payload);
- ::_encode(incremental_maps, payload);
+ payload.append((char*)&epoch, sizeof(epoch));
+ ::_encode(encoded, payload);
}
virtual char *get_type_name() { return "mdsmap"; }
#include "msg/Message.h"
class MNSLookup : public Message {
- msg_addr_t entity;
+ entity_name_t entity;
public:
MNSLookup() {}
- MNSLookup(msg_addr_t e) :
+ MNSLookup(entity_name_t e) :
Message(MSG_NS_LOOKUP) {
entity = e;
}
char *get_type_name() { return "NSLook"; }
- msg_addr_t get_entity() { return entity; }
+ entity_name_t get_entity() { return entity; }
void encode_payload() {
payload.append((char*)&entity, sizeof(entity));
class MNSLookupReply : public Message {
public:
- map<msg_addr_t, entity_inst_t> entity_map;
+ map<entity_name_t, entity_inst_t> entity_map;
public:
MNSLookupReply() {}
#include "msg/TCPMessenger.h"
class MNSRegister : public Message {
- msg_addr_t addr;
+ entity_name_t addr;
int rank;
long tid;
public:
MNSRegister() {}
- MNSRegister(msg_addr_t a, int r, int ti) :
+ MNSRegister(entity_name_t a, int r, int ti) :
Message(MSG_NS_REGISTER) {
addr = a;
rank = r;
char *get_type_name() { return "NSReg"; }
- msg_addr_t get_entity() { return addr; }
+ entity_name_t get_entity() { return addr; }
int get_rank() { return rank; }
long get_tid() { return tid; }
#include "msg/TCPMessenger.h"
class MNSRegisterAck : public Message {
- msg_addr_t entity;
+ entity_name_t entity;
long tid;
public:
MNSRegisterAck() {}
- MNSRegisterAck(long t, msg_addr_t e) :
+ MNSRegisterAck(long t, entity_name_t e) :
Message(MSG_NS_REGISTERACK) {
entity = e;
tid = t;
char *get_type_name() { return "NSRegA"; }
- msg_addr_t get_entity() { return entity; }
+ entity_name_t get_entity() { return entity; }
long get_tid() { return tid; }
void encode_payload() {
#include "msg/Message.h"
#include "include/types.h"
+#include "osd/osd_types.h"
class MOSDBoot : public Message {
public:
class MOSDFailure : public Message {
public:
- msg_addr_t failed;
- entity_inst_t inst;
+ entity_inst_t failed;
epoch_t epoch;
MOSDFailure() {}
- MOSDFailure(msg_addr_t f, const entity_inst_t& i, epoch_t e) :
+ MOSDFailure(entity_inst_t f, epoch_t e) :
Message(MSG_OSD_FAILURE),
- failed(f), inst(i), epoch(e) {}
+ failed(f), epoch(e) {}
- msg_addr_t get_failed() { return failed; }
- entity_inst_t& get_inst() { return inst; }
+ entity_inst_t get_failed() { return failed; }
epoch_t get_epoch() { return epoch; }
void decode_payload() {
int off = 0;
payload.copy(off, sizeof(failed), (char*)&failed);
off += sizeof(failed);
- payload.copy(off, sizeof(inst), (char*)&inst);
- off += sizeof(inst);
payload.copy(off, sizeof(epoch), (char*)&epoch);
off += sizeof(epoch);
}
void encode_payload() {
payload.append((char*)&failed, sizeof(failed));
- payload.append((char*)&inst, sizeof(inst));
payload.append((char*)&epoch, sizeof(epoch));
}
#define __MOSDOP_H
#include "msg/Message.h"
+#include "osd/osd_types.h"
/*
* OSD op
*
*/
-//#define OSD_OP_MKFS 20
-
-// client ops
+// osd client ops
#define OSD_OP_READ 1
#define OSD_OP_STAT 2
#define OSD_OP_PUSH 31
-typedef struct {
- long pcid;
-
- // who's asking?
- tid_t tid;
- msg_addr_t client;
- entity_inst_t client_inst;
-
- // for replication
- tid_t rep_tid;
-
- object_t oid;
- objectrev_t rev;
- pg_t pg;
-
- epoch_t map_epoch;
-
- eversion_t pg_trim_to; // primary->replica: trim to here
-
- int op;
- size_t length, offset;
- eversion_t version;
- eversion_t old_version;
-
- bool want_ack;
- bool want_commit;
-} MOSDOp_st;
-
class MOSDOp : public Message {
public:
static const char* get_opname(int op) {
}
private:
- MOSDOp_st st;
+ struct {
+ long pcid;
+
+ // who's asking?
+ entity_inst_t client;
+ reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too.
+
+ // for replication
+ tid_t rep_tid;
+
+ object_t oid;
+ objectrev_t rev;
+ pg_t pg;
+
+ epoch_t map_epoch;
+
+ eversion_t pg_trim_to; // primary->replica: trim to here
+
+ int op;
+ size_t length, offset;
+ eversion_t version;
+ eversion_t old_version;
+
+ bool want_ack;
+ bool want_commit;
+ } st;
+
bufferlist data;
map<string,bufferptr> attrset;
friend class MOSDOpReply;
public:
- const tid_t get_tid() { return st.tid; }
- const msg_addr_t& get_client() { return st.client; }
- const entity_inst_t& get_client_inst() { return st.client_inst; }
- void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+ const reqid_t& get_reqid() { return st.reqid; }
+ const tid_t get_client_tid() { return st.reqid.tid; }
+ int get_client_inc() { return st.reqid.inc; }
+
+ const entity_name_t& get_client() { return st.client.name; }
+ const entity_inst_t& get_client_inst() { return st.client; }
+ void set_client_inst(const entity_inst_t& i) { st.client = i; }
const tid_t get_rep_tid() { return st.rep_tid; }
void set_rep_tid(tid_t t) { st.rep_tid = t; }
- const object_t get_oid() { return st.oid; }
- const pg_t get_pg() { return st.pg; }
+ const object_t get_oid() { return st.oid; }
+ const pg_t get_pg() { return st.pg; }
const epoch_t get_map_epoch() { return st.map_epoch; }
//const int get_pg_role() { return st.pg_role; } // who am i asking for?
void set_pcid(long pcid) { this->st.pcid = pcid; }
long get_pcid() { return st.pcid; }
- MOSDOp(long tid, msg_addr_t asker,
+ MOSDOp(entity_inst_t asker, int inc, long tid,
object_t oid, pg_t pg, epoch_t mapepoch, int op) :
Message(MSG_OSD_OP) {
memset(&st, 0, sizeof(st));
this->st.client = asker;
- this->st.tid = tid;
- this->st.rep_tid = 0;
+ this->st.reqid.name = asker.name;
+ this->st.reqid.inc = inc;
+ this->st.reqid.tid = tid;
this->st.oid = oid;
this->st.pg = pg;
this->st.map_epoch = mapepoch;
this->st.op = op;
+ this->st.rep_tid = 0;
+
this->st.want_ack = true;
this->st.want_commit = true;
}
}
virtual char *get_type_name() { return "oop"; }
+
+ void print(ostream& out) {
+ out << "osd_op(" << st.reqid
+ << " " << get_opname(st.op)
+ << " " << st.oid
+ //<< " " << this
+ << ")";
+ }
};
-inline ostream& operator<<(ostream& out, MOSDOp& op)
-{
- return out << "MOSDOp(" << op.get_client() << "." << op.get_tid()
- << " op " << MOSDOp::get_opname(op.get_op())
- << " oid " << hex << op.get_oid() << dec << " " << &op << ")";
-}
#endif
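The MOSDOp constructor now takes the client's entity_inst_t, incarnation, and tid (folded into reqid_t) instead of a bare tid and msg_addr_t. A client-side sketch, not part of the patch; the variables are placeholders:

// Sketch: building a read with the new signature; reqid = (name, inc, tid).
MOSDOp *make_read(const entity_inst_t& my_inst, int my_inc, long tid,
                  object_t oid, pg_t pg, epoch_t map_epoch)
{
  return new MOSDOp(my_inst, my_inc, tid, oid, pg, map_epoch, OSD_OP_READ);
}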
*
*/
-
-typedef struct {
- // req
- long pcid;
- tid_t tid;
- tid_t rep_tid;
-
- object_t oid;
- pg_t pg;
-
- int op;
-
- // reply
- int result;
- bool commit;
- size_t length, offset;
- size_t object_size;
- eversion_t version;
-
- eversion_t pg_complete_thru;
-
- epoch_t map_epoch;
-} MOSDOpReply_st;
-
-
class MOSDOpReply : public Message {
- MOSDOpReply_st st;
+ struct {
+ // req
+ reqid_t reqid;
+
+ tid_t rep_tid;
+
+ object_t oid;
+ pg_t pg;
+
+ int op;
+
+ // reply
+ int result;
+ bool commit;
+ size_t length, offset;
+ size_t object_size;
+ eversion_t version;
+
+ eversion_t pg_complete_thru;
+
+ epoch_t map_epoch;
+ } st;
+
bufferlist data;
map<string,bufferptr> attrset;
public:
- long get_tid() { return st.tid; }
+ const reqid_t& get_reqid() { return st.reqid; }
+ long get_tid() { return st.reqid.tid; }
long get_rep_tid() { return st.rep_tid; }
object_t get_oid() { return st.oid; }
pg_t get_pg() { return st.pg; }
void set_attrset(map<string,bufferptr> &as) { attrset = as; }
void set_op(int op) { st.op = op; }
- void set_tid(tid_t t) { st.tid = t; }
void set_rep_tid(tid_t t) { st.rep_tid = t; }
// data payload
// osdmap
epoch_t get_map_epoch() { return st.map_epoch; }
- // keep a pcid (procedure call id) to match up request+reply
- void set_pcid(long pcid) { this->st.pcid = pcid; }
- long get_pcid() { return st.pcid; }
public:
MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) :
Message(MSG_OSD_OPREPLY) {
memset(&st, 0, sizeof(st));
- this->st.pcid = req->st.pcid;
-
+ this->st.reqid = req->st.reqid;
this->st.op = req->st.op;
- this->st.tid = req->st.tid;
this->st.rep_tid = req->st.rep_tid;
this->st.oid = req->st.oid;
}
virtual char *get_type_name() { return "oopr"; }
+
+ void print(ostream& out) {
+ out << "osd_op_reply(" << st.reqid
+ << " " << MOSDOp::get_opname(st.op)
+ << " " << st.oid << " = " << st.result
+ //<< " " << this
+ << ")";
+ }
+
};
+
#endif
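With the pcid gone from the reply, a client would match replies to outstanding requests by the tid carried in the reqid. A sketch, not part of the patch; the `outstanding` map and completion step are hypothetical:

// Sketch: look up the original request by the tid in the reply's reqid.
MOSDOp *match_reply(MOSDOpReply *reply, map<tid_t, MOSDOp*>& outstanding)
{
  tid_t tid = reply->get_reqid().tid;    // equivalently reply->get_tid()
  map<tid_t, MOSDOp*>::iterator p = outstanding.find(tid);
  if (p == outstanding.end()) return 0;  // dup or stale reply
  MOSDOp *req = p->second;
  outstanding.erase(p);
  return req;                            // caller completes and frees both
}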
#include "mon/MonMap.h"
-bool parse_ip_port(const char *s, tcpaddr_t& tcpaddr)
-{
- unsigned char addr[4];
- int port = 0;
-
- int count = 0; // digit count
-
- while (1) {
- // parse the #.
- int val = 0;
- int numdigits = 0;
-
- while (*s >= '0' && *s <= '9') {
- int digit = *s - '0';
- //cout << "digit " << digit << endl;
- val *= 10;
- val += digit;
- numdigits++;
- s++;
- }
- //cout << "val " << val << endl;
-
- if (numdigits == 0) return false; // no digits
- if (count < 3 && *s != '.') return false; // should have 3 periods
- if (count == 3 && *s != ':') return false; // then a colon
- s++;
-
- if (count <= 3)
- addr[count] = val;
- else
- port = val;
-
- count++;
- if (count == 5) break;
- }
-
- // copy into inst
- memset(&tcpaddr, 0, sizeof(addr));
- tcpaddr.sin_family = AF_INET;
- memcpy((char*)&tcpaddr.sin_addr.s_addr, (char*)addr, 4);
- tcpaddr.sin_port = htons(port);
- return true;
-}
int main(int argc, char **argv)
outfn = args[++i];
else {
// parse ip:port
- tcpaddr_t addr;
- if (!parse_ip_port(args[i], addr)) {
+ entity_inst_t inst;
+ if (!parse_ip_port(args[i], inst.addr)) {
cerr << "mkmonmap: invalid ip:port '" << args[i] << "'" << endl;
return -1;
}
-
- entity_inst_t inst;
- inst.set_addr(addr);
- cout << "mkmonmap: mon" << monmap.num_mon << " " << inst << endl;
+ inst.name = MSG_ADDR_MON(monmap.num_mon);
+ cout << "mkmonmap: adding " << inst << endl;
monmap.add_mon(inst);
}
}
assert(m->get_source().is_client());
int from = m->get_source().num();
- // choose an MDS id
+ // choose a client id
if (from < 0 ||
- (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_inst())) {
+ (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_addr())) {
from = ++num_clients;
dout(10) << "client_boot assigned client" << from << endl;
}
- client_map[MSG_ADDR_CLIENT(from)] = m->get_source_inst();
+ client_map[MSG_ADDR_CLIENT(from)] = m->get_source_addr();
// reply with latest mds map
- mon->mdsmon->send_latest(MSG_ADDR_CLIENT(from), m->get_source_inst());
+ entity_inst_t to = m->get_source_inst();
+ to.name = MSG_ADDR_CLIENT(from);
+ mon->mdsmon->send_latest(to);
delete m;
}
private:
int num_clients;
- map<msg_addr_t,entity_inst_t> client_map;
+ map<entity_name_t,entity_addr_t> client_map;
void bcast_latest_mds();
for (int i=0; i<mon->monmap->num_mon; ++i) {
if (i == whoami) continue;
mon->messenger->send_message(new MMonElectionPropose,
- MSG_ADDR_MON(i), mon->monmap->get_inst(i));
+ mon->monmap->get_inst(i));
}
reset_timer();
leader_acked = who;
ack_stamp = g_clock.now();
mon->messenger->send_message(new MMonElectionAck,
- MSG_ADDR_MON(who), mon->monmap->get_inst(who));
+ mon->monmap->get_inst(who));
// set a timer
reset_timer();
expire_event);
}
+
void Elector::cancel_timer()
{
if (expire_event)
for (int i=0; i<mon->monmap->num_mon; ++i) {
if (i == whoami) continue;
mon->messenger->send_message(new MMonElectionVictory,
- MSG_ADDR_MON(i), mon->monmap->get_inst(i));
+ mon->monmap->get_inst(i));
}
// tell monitor
#include "MDSMonitor.h"
#include "Monitor.h"
-#include "messages/MMDSBoot.h"
#include "messages/MMDSMap.h"
#include "messages/MMDSGetMap.h"
-//#include "messages/MMDSFailure.h"
+#include "messages/MMDSBeacon.h"
#include "common/Timer.h"
{
mdsmap.epoch = 0; // until everyone boots
mdsmap.ctime = g_clock.now();
- for (int i=0; i<g_conf.num_mds; i++) {
- mdsmap.all_mds.insert(i);
- mdsmap.down_mds.insert(i);
- }
+
+ print_map();
}
void MDSMonitor::dispatch(Message *m)
{
switch (m->get_type()) {
- case MSG_MDS_BOOT:
- handle_mds_boot((MMDSBoot*)m);
+ case MSG_MDS_BEACON:
+ handle_mds_beacon((MMDSBeacon*)m);
break;
case MSG_MDS_GETMAP:
handle_mds_getmap((MMDSGetMap*)m);
break;
- /*
- case MSG_MDS_FAILURE:
- handle_mds_failure((MMDSFailure*)m);
- break;
- */
-
- case MSG_SHUTDOWN:
- handle_mds_shutdown(m);
- break;
-
default:
assert(0);
}
}
-void MDSMonitor::handle_mds_boot(MMDSBoot *m)
+void MDSMonitor::print_map()
{
- dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl;
- assert(m->get_source().is_mds());
+ dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl;
+ entity_inst_t blank;
+ set<int> all;
+ mdsmap.get_mds_set(all);
+ for (set<int>::iterator p = all.begin();
+ p != all.end();
+ ++p) {
+ dout(7) << " mds" << *p << "." << mdsmap.mds_inc[*p]
+ << " : " << MDSMap::get_state_name(mdsmap.get_state(*p))
+ << " : " << (mdsmap.have_inst(*p) ? mdsmap.get_inst(*p) : blank)
+ << endl;
+ }
+}
+
+
+
+void MDSMonitor::handle_mds_beacon(MMDSBeacon *m)
+{
+ dout(7) << "mds_beacon " << *m
+ << " from " << m->get_source()
+ << " " << m->get_source_inst()
+ << endl;
int from = m->get_source().num();
+ int state = m->get_state();
+ version_t seq = m->get_seq();
+ // initial boot?
+ bool booted = false;
+
// choose an MDS id
- if (from < 0 || !mdsmap.is_down(from)) {
- for (from=0; ; ++from)
- if (mdsmap.is_down(from)) break;
- dout(10) << "mds_boot assigned mds" << from << endl;
+ if (from >= 0) {
+ // wants to be (or already is) a specific MDS.
+ if (mdsmap.is_down(from)) {
+ dout(10) << "mds_beacon assigning requested mds" << from << endl;
+ booted = true;
+ } else if (mdsmap.get_inst(from) != m->get_source_inst()) {
+ dout(10) << "mds_beacon not assigning requested mds" << from
+ << ", that mds is up and someone else" << endl;
+ from = -1;
+ }
+ }
+ if (from < 0) {
+ // pick a failed mds?
+ set<int> failed;
+ mdsmap.get_failed_mds_set(failed);
+ if (!failed.empty()) {
+ from = *failed.begin();
+ dout(10) << "mds_beacon assigned failed mds" << from << endl;
+ booted = true;
+ }
+ }
+ if (from < 0) {
+ // ok, just pick any unused mds id.
+ for (from=0; ; ++from) {
+ if (mdsmap.is_dne(from) ||
+ mdsmap.is_out(from)) {
+ dout(10) << "mds_beacon assigned out|dne mds" << from << endl;
+ booted = true;
+ break;
+ }
+ }
+ }
+
+
+ // old beacon?
+ if (mdsmap.mds_state_seq[from] > seq) {
+ dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl;
+ delete m;
+ return;
}
- if (mdsmap.get_epoch() == 0) {
- // waiting for boot!
- mdsmap.mds_inst[from] = m->get_source_inst();
- mdsmap.down_mds.erase(from);
+ // reply to beacon?
+ if (state != MDSMap::STATE_OUT) {
+ last_beacon[from] = g_clock.now(); // note time
+ messenger->send_message(new MMDSBeacon(state, seq),
+ m->get_source_inst());
+ }
+
+
+ // make sure it's in the map
+ if (booted) {
+ mdsmap.mds_inst[from].addr = m->get_source_addr();
+ mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from);
+ mdsmap.mds_inc[from]++;
+
+ // starting -> creating|starting|replay
+ if (mdsmap.is_degraded() &&
+ !mdsmap.is_failed(from)) {
+ dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl;
+ state = MDSMap::STATE_STANDBY;
+ }
+ else if (state == MDSMap::STATE_STARTING) {
+ if (mdsmap.is_failed(from)) {
+ dout(10) << "mds_beacon will recover mds" << from << endl;
+ state = MDSMap::STATE_REPLAY;
+ }
+ else if (mdsmap.is_out(from)) {
+ dout(10) << "mds_beacon will start mds" << from << endl;
+ state = MDSMap::STATE_STARTING;
+ }
+ else {
+ dout(10) << "mds_beacon will create mds" << from << endl;
+ state = MDSMap::STATE_CREATING;
+ }
+ }
+ }
+
+  // creating -> active: note that creation finished; if still degraded, go to standby instead
+ if (state == MDSMap::STATE_ACTIVE && mdsmap.is_creating(from)) {
+ mdsmap.mds_created.insert(from);
+ dout(10) << "mds_beacon created mds" << from << endl;
- if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) {
- mdsmap.inc_epoch();
- dout(-7) << "mds_boot all MDSs booted." << endl;
- mdsmap.encode(maps[mdsmap.get_epoch()]); // 1
-
- bcast_latest_mds();
- send_current();
- } else {
- dout(7) << "mds_boot waiting for "
- << (mdsmap.get_num_mds() - mdsmap.mds_inst.size())
- << " mdss to boot" << endl;
+ if (mdsmap.is_degraded()) {
+ dout(10) << "mds_beacon current degraded, marking mds" << from << " as standby" << endl;
+ state = MDSMap::STATE_STANDBY;
}
- return;
- } else {
- dout(0) << "mds_boot everyone already booted, so who is this? write me." << endl;
- assert(0);
}
-}
-void MDSMonitor::handle_mds_shutdown(Message *m)
-{
- assert(m->get_source().is_mds());
- int from = m->get_source().num();
- mdsmap.mds_inst.erase(from);
- mdsmap.all_mds.erase(from);
+ // did we update the map?
+ if (mdsmap.mds_state.count(from) == 0 ||
+ mdsmap.mds_state[from] != state) {
+ // update mds state
+ dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from])
+ << " -> " << MDSMap::get_state_name(state)
+ << endl;
+ mdsmap.mds_state[from] = state;
+ if (mdsmap.is_up(from))
+ mdsmap.mds_state_seq[from] = seq;
+ else
+ mdsmap.mds_state_seq.erase(from);
+
+ // inc map version
+ mdsmap.inc_epoch();
+ mdsmap.encode(maps[mdsmap.get_epoch()]);
+
+ print_map();
+
+ // bcast map
+ bcast_latest_mds();
+ send_current();
+ }
- dout(7) << "mds_shutdown from " << m->get_source()
- << ", still have " << mdsmap.all_mds
- << endl;
-
- // tell someone?
- // fixme
-
delete m;
}
{
dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl;
if (mdsmap.get_epoch() > 0)
- send_full(m->get_source(), m->get_source_inst());
+ send_full(m->get_source_inst());
else
- awaiting_map[m->get_source()] = m->get_source_inst();
+ awaiting_map.push_back( m->get_source_inst() );
}
dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl;
// tell mds
- for (set<int>::iterator p = mdsmap.get_mds().begin();
- p != mdsmap.get_mds().end();
- p++) {
- if (mdsmap.is_down(*p)) continue;
- send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p));
- }
+ set<int> up;
+ mdsmap.get_up_mds_set(up);
+ for (set<int>::iterator p = up.begin();
+ p != up.end();
+ p++)
+ send_full(mdsmap.get_inst(*p));
}
-void MDSMonitor::send_full(msg_addr_t dest, const entity_inst_t& inst)
+void MDSMonitor::send_full(entity_inst_t dest)
{
- dout(11) << "send_full to " << dest << " inst " << inst << endl;
- messenger->send_message(new MMDSMap(&mdsmap), dest, inst);
+ dout(11) << "send_full to " << dest << endl;
+ messenger->send_message(new MMDSMap(&mdsmap), dest);
}
void MDSMonitor::send_current()
{
dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl;
- for (map<msg_addr_t,entity_inst_t>::iterator i = awaiting_map.begin();
+ for (list<entity_inst_t>::iterator i = awaiting_map.begin();
i != awaiting_map.end();
i++)
- send_full(i->first, i->second);
+ send_full(*i);
awaiting_map.clear();
}
-void MDSMonitor::send_latest(msg_addr_t dest, const entity_inst_t& inst)
+void MDSMonitor::send_latest(entity_inst_t dest)
{
// FIXME: check if we're locked, etc.
if (mdsmap.get_epoch() > 0)
- send_full(dest, inst);
+ send_full(dest);
else
- awaiting_map[dest] = inst;
+ awaiting_map.push_back(dest);
+}
+
+
+void MDSMonitor::tick()
+{
+ // make sure mds's are still alive
+ utime_t now = g_clock.now();
+ if (now > g_conf.mds_beacon_grace) {
+ utime_t cutoff = now;
+ cutoff -= g_conf.mds_beacon_grace;
+
+ bool changed = false;
+
+ set<int> up;
+ mdsmap.get_up_mds_set(up);
+
+ for (set<int>::iterator p = up.begin();
+ p != up.end();
+ ++p) {
+ if (last_beacon.count(*p)) {
+ if (last_beacon[*p] < cutoff) {
+
+ // failure!
+ int newstate;
+ switch (mdsmap.get_state(*p)) {
+ case MDSMap::STATE_CREATING:
+ // didn't finish creating
+ newstate = MDSMap::STATE_DNE;
+ break;
+
+ case MDSMap::STATE_STANDBY:
+ if (mdsmap.has_created(*p))
+ newstate = MDSMap::STATE_OUT;
+ else
+ newstate = MDSMap::STATE_DNE;
+ break;
+
+ case MDSMap::STATE_REPLAY:
+ case MDSMap::STATE_REJOIN:
+ case MDSMap::STATE_ACTIVE:
+ case MDSMap::STATE_STOPPING:
+ newstate = MDSMap::STATE_FAILED;
+ break;
+
+ case MDSMap::STATE_STARTING:
+ case MDSMap::STATE_STOPPED:
+ newstate = MDSMap::STATE_OUT;
+ break;
+
+ default:
+ assert(0);
+ }
+
+ dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p]
+ << ", marking " << mdsmap.get_state_name(newstate)
+ << endl;
+
+ // update map
+ mdsmap.mds_state[*p] = newstate;
+ mdsmap.mds_state_seq.erase(*p);
+ changed = true;
+ }
+ } else {
+ dout(10) << "no beacons from mds" << *p << ", assuming one " << now << endl;
+ last_beacon[*p] = now;
+ }
+ }
+
+ if (changed) {
+ mdsmap.inc_epoch();
+ mdsmap.encode(maps[mdsmap.get_epoch()]);
+
+ print_map();
+
+ // bcast map
+ bcast_latest_mds();
+ send_current();
+ }
+ }
}
//map<epoch_t, bufferlist> inc_maps;
//MDSMap::Incremental pending_inc;
- map<msg_addr_t,entity_inst_t> awaiting_map;
-
+ list<entity_inst_t> awaiting_map;
+
+ // beacons
+ map<int, utime_t> last_beacon;
+
+ bool is_alive(int mds);
+
// maps
void create_initial();
void send_current(); // send current map to waiters.
- void send_full(msg_addr_t dest, const entity_inst_t& inst);
+ void send_full(entity_inst_t dest);
void bcast_latest_mds();
+ void print_map();
+
//void accept_pending(); // accept pending, new map.
//void send_incremental(epoch_t since, msg_addr_t dest);
- void handle_mds_boot(class MMDSBoot *m);
- void handle_mds_failure(class MMDSFailure *m);
+ void handle_mds_state(class MMDSState *m);
+ void handle_mds_beacon(class MMDSBeacon *m);
+ //void handle_mds_failure(class MMDSFailure *m);
void handle_mds_getmap(class MMDSGetMap *m);
- void handle_mds_shutdown(Message *m);
void dispatch(Message *m);
void tick(); // check state, take actions
- void send_latest(msg_addr_t dest, const entity_inst_t& inst);
+ void send_latest(entity_inst_t dest);
};
return last_mon;
}
- const entity_inst_t get_inst(int m) {
+ const entity_inst_t &get_inst(int m) {
assert(m < num_mon);
return mon_inst[m];
}
void Monitor::init()
{
+ lock.Lock();
+
dout(1) << "init" << endl;
// store
q.insert(whoami);
win_election(q);
}
+
+ lock.Unlock();
}
void Monitor::shutdown()
{
dout(1) << "shutdown" << endl;
+ // cancel all events
cancel_tick();
-
+ timer.cancel_all();
+ timer.join();
+
+ // unmount my local storage
if (store)
delete store;
if (osdmon->osdmap.is_down(*it)) continue;
dout(10) << "sending shutdown to osd" << *it << endl;
messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
- MSG_ADDR_OSD(*it), osdmon->osdmap.get_inst(*it));
+ osdmon->osdmap.get_inst(*it));
}
// monitors too.
for (int i=0; i<monmap->num_mon; i++)
if (i != whoami)
messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
- MSG_ADDR_MON(i), monmap->get_inst(i));
+ monmap->get_inst(i));
// clean up
if (monmap) delete monmap;
break;
case MSG_SHUTDOWN:
- if (m->get_source().is_mds()) {
- mdsmon->dispatch(m);
- if (mdsmon->mdsmap.get_num_mds() == 0)
- shutdown();
- }
- else if (m->get_source().is_osd()) {
- osdmon->dispatch(m);
- }
+ assert(m->get_source().is_osd());
+ osdmon->dispatch(m);
break;
// MDSs
- case MSG_MDS_BOOT:
+ case MSG_MDS_BEACON:
case MSG_MDS_GETMAP:
mdsmon->dispatch(m);
+
+ // hackish: did all mds's shut down?
+ if (g_conf.mon_stop_with_last_mds &&
+ mdsmon->mdsmap.get_num_up_or_failed_mds() == 0)
+ shutdown();
+
break;
// clients
-/************ TIMER ***************/
+/************ TICK ***************/
class C_Mon_Tick : public Context {
Monitor *mon;
public:
C_Mon_Tick(Monitor *m) : mon(m) {}
void finish(int r) {
- mon->tick(this);
+ mon->tick();
}
};
-
void Monitor::cancel_tick()
{
- if (!tick_timer) return;
-
- if (g_timer.cancel_event(tick_timer)) {
- dout(10) << "cancel_tick canceled" << endl;
- } else {
- // already dispatched!
- dout(10) << "cancel_tick timer dispatched, waiting to cancel" << endl;
- tick_timer = (Context*)1; // hackish.
- while (tick_timer)
- tick_timer_cond.Wait(lock);
- }
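+  // SafeTimer (tied to the monitor lock) makes cancel vs. dispatch safe, so the old handshake is gone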
+ if (tick_timer) timer.cancel_event(tick_timer);
}
void Monitor::reset_tick()
{
- if (tick_timer)
- cancel_tick();
+ cancel_tick();
tick_timer = new C_Mon_Tick(this);
- g_timer.add_event_after(g_conf.mon_tick_interval, tick_timer);
+ timer.add_event_after(g_conf.mon_tick_interval, tick_timer);
}
-void Monitor::tick(Context *timer)
+void Monitor::tick()
{
- lock.Lock();
- {
- if (tick_timer != timer) {
- dout(10) << "tick - canceled" << endl;
- tick_timer = 0;
- tick_timer_cond.Signal();
- lock.Unlock();
- return;
- }
-
- tick_timer = 0;
-
- // ok go.
- dout(10) << "tick" << endl;
+ tick_timer = 0;
- osdmon->tick();
-
- // next tick!
- reset_tick();
- }
- lock.Unlock();
+ // ok go.
+ dout(11) << "tick" << endl;
+
+ osdmon->tick();
+ mdsmon->tick();
+
+ // next tick!
+ reset_tick();
}
#include "include/types.h"
#include "msg/Messenger.h"
+#include "common/Timer.h"
+
#include "MonMap.h"
#include "Elector.h"
#include "Paxos.h"
MonMap *monmap;
// timer.
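+  // per-monitor SafeTimer (tied to the monitor lock) replaces the global g_timer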
+ SafeTimer timer;
Context *tick_timer;
- Cond tick_timer_cond;
void cancel_tick();
void reset_tick();
friend class C_Mon_Tick;
whoami(w),
messenger(m),
monmap(mm),
- tick_timer(0),
+ timer(lock), tick_timer(0),
store(0),
elector(this, w),
mon_epoch(0),
void init();
void shutdown();
void dispatch(Message *m);
- void tick(Context *timer);
+ void tick();
};
// tell a random osd
int osd = rand() % g_conf.num_osd;
send_incremental(osdmap.get_epoch()-1, // ick! FIXME
- MSG_ADDR_OSD(osd), osdmap.get_inst(osd));
+ osdmap.get_inst(osd));
}
accept_pending();
// tell him!
- send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(r), osdmap.get_inst(r));
+ send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(r));
// do it again?
/*
if (g_conf.num_osd - d > 4 &&
g_conf.num_osd - d > g_conf.num_osd/2)
- g_timer.add_event_after(g_conf.fake_osdmap_expand,
+ mon->timer.add_event_after(g_conf.fake_osdmap_expand,
new C_Mon_Faker(this));
*/
}
i != g_fake_osd_down.end();
i++) {
dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
- g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1));
+ mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1));
}
for (map<int,float>::iterator i = g_fake_osd_out.begin();
i != g_fake_osd_out.end();
i++) {
dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
- g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0));
+ mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0));
}
}
// FIXME
// take their word for it
- int from = m->get_failed().num();
+ int from = m->get_failed().name.num();
if (osdmap.is_up(from) &&
(osdmap.osd_inst.count(from) == 0 ||
- osdmap.osd_inst[from] == m->get_inst())) {
- pending_inc.new_down[from] = m->get_inst();
+ osdmap.osd_inst[from] == m->get_failed())) {
+ pending_inc.new_down[from] = m->get_failed();
if (osdmap.is_in(from))
down_pending_out[from] = g_clock.now();
accept_pending();
- send_incremental(m->get_epoch(), m->get_source(), m->get_source_inst());
+ send_incremental(m->get_epoch(), m->get_source_inst());
send_waiting();
bcast_latest_mds();
void OSDMonitor::fake_osd_failure(int osd, bool down)
{
- lock.Lock();
- {
- if (down) {
- dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
- pending_inc.new_down[osd] = osdmap.osd_inst[osd];
- } else {
- dout(1) << "fake_osd_failure OUT osd" << osd << endl;
- pending_inc.new_out.push_back(osd);
- }
- accept_pending();
- bcast_latest_osd();
- bcast_latest_mds();
+ if (down) {
+ dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
+ pending_inc.new_down[osd] = osdmap.osd_inst[osd];
+ } else {
+ dout(1) << "fake_osd_failure OUT osd" << osd << endl;
+ pending_inc.new_out.push_back(osd);
}
- lock.Unlock();
+ accept_pending();
+ bcast_latest_osd();
+ bcast_latest_mds();
}
accept_pending();
// the booting osd will spread word
- send_incremental(m->sb.current_epoch, m->get_source(), m->get_source_inst());
+ send_incremental(m->sb.current_epoch, m->get_source_inst());
delete m;
// tell mds
if (osdmap.is_out(from))
pending_inc.new_in.push_back(from);
accept_pending();
- send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+ send_incremental(m->map_epoch, m->get_source_inst());
}
void OSDMonitor::handle_osd_out(MOSDOut *m)
if (osdmap.is_in(from)) {
pending_inc.new_out.push_back(from);
accept_pending();
- send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+ send_incremental(m->map_epoch, m->get_source_inst());
}
}
awaiting_map[m->get_source()].second = m->get_since();
} else {
//if (m->get_since())
- send_incremental(m->get_since(), m->get_source(), m->get_source_inst());
+ send_incremental(m->get_since(), m->get_source_inst());
//else
//send_full(m->get_source(), m->get_source_inst());
}
i++) {
dout(0) << "osd" << i->first << " UP " << i->second << endl;
derr(0) << "osd" << i->first << " UP " << i->second << endl;
- messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
}
for (map<int,entity_inst_t>::iterator i = pending_inc.new_down.begin();
i != pending_inc.new_down.end();
i++) {
dout(0) << "osd" << i->first << " DOWN " << i->second << endl;
derr(0) << "osd" << i->first << " DOWN " << i->second << endl;
- messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
+ messenger->mark_down(i->second.addr);
}
for (list<int>::iterator i = pending_inc.new_in.begin();
i != pending_inc.new_in.end();
{
dout(10) << "send_waiting " << osdmap.get_epoch() << endl;
- for (map<msg_addr_t,pair<entity_inst_t,epoch_t> >::iterator i = awaiting_map.begin();
+ for (map<entity_name_t,pair<entity_inst_t,epoch_t> >::iterator i = awaiting_map.begin();
i != awaiting_map.end();
i++)
- send_incremental(i->second.second, i->first, i->second.first);
+ send_incremental(i->second.second, i->second.first);
}
-void OSDMonitor::send_full(msg_addr_t who, const entity_inst_t& inst)
+void OSDMonitor::send_full(entity_inst_t who)
{
- messenger->send_message(new MOSDMap(&osdmap), who, inst);
+ messenger->send_message(new MOSDMap(&osdmap), who);
}
-void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst)
+void OSDMonitor::send_incremental(epoch_t since, entity_inst_t dest)
{
dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch()
<< " to " << dest << endl;
}
}
- messenger->send_message(m, dest, inst);
+ messenger->send_message(m, dest);
}
dout(1) << "bcast_latest_mds epoch " << e << endl;
// tell mds
- for (set<int>::iterator i = mon->mdsmon->mdsmap.get_mds().begin();
- i != mon->mdsmon->mdsmap.get_mds().end();
+ set<int> up;
+ mon->mdsmon->mdsmap.get_up_mds_set(up);
+ for (set<int>::iterator i = up.begin();
+ i != up.end();
i++) {
- if (mon->mdsmon->mdsmap.is_down(*i)) continue;
- send_incremental(osdmap.get_epoch()-1, MSG_ADDR_MDS(*i), mon->mdsmon->mdsmap.get_inst(*i));
+ send_incremental(osdmap.get_epoch()-1, mon->mdsmon->mdsmap.get_inst(*i));
}
}
it++) {
if (osdmap.is_down(*it)) continue;
- send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(*it), osdmap.get_inst(*it));
+ send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(*it));
}
}
else if (mon->is_peon()) {
// peon. send info
//messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch),
- //MSG_ADDR_MON(mon->leader), mon->monmap->get_inst(mon->leader));
+ // mon->monmap->get_inst(mon->leader));
}
}
// bring up to date
if (epoch < osdmap.get_epoch())
- send_incremental(epoch, m->get_source(), m->get_source_inst());
+ send_incremental(epoch, m->get_source_inst());
delete m;
}
i++) {
if (*i == mon->whoami) continue;
messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire),
- MSG_ADDR_MON(*i), mon->monmap->get_inst(*i));
+ mon->monmap->get_inst(*i));
pending_ack.insert(*i);
}
}
if (*i == mon->whoami) continue;
messenger->send_message(new MMonOSDMapUpdatePrepare(epoch,
map_bl, inc_map_bl),
- MSG_ADDR_MON(*i), mon->monmap->get_inst(*i));
+ mon->monmap->get_inst(*i));
pending_ack.insert(*i);
}
}
// ack
messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()),
- m->get_source(), m->get_source_inst());
+ m->get_source_inst());
delete m;
}
OSDMap osdmap;
private:
- map<msg_addr_t, pair<entity_inst_t, epoch_t> > awaiting_map;
+ map<entity_name_t, pair<entity_inst_t, epoch_t> > awaiting_map;
void create_initial();
bool get_map_bl(epoch_t epoch, bufferlist &bl);
// maps
void accept_pending(); // accept pending, new map.
void send_waiting(); // send current map to waiters.
- void send_full(msg_addr_t dest, const entity_inst_t& inst);
- void send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst);
+ void send_full(entity_inst_t dest);
+ void send_incremental(epoch_t since, entity_inst_t dest);
void bcast_latest_mds();
void bcast_latest_osd();
// .. do something else too
version_t pn = get_new_proposal_number();
for (int i=0; i<mon->monmap->num_mon; ++i) {
- if (i == whoami) continue;
- // todo high rf I pass the pn twice... what is the last parameter for?
- mon->messenger->send_message(new MMonPaxos(MMonPaxos::OP_COLLECT, whoami, pn, pn),
- MSG_ADDR_MON(i), mon->monmap->get_inst(i));
+ if (i == whoami) continue;
+ // todo high rf I pass the pn twice... what is the last parameter for?
+ mon->messenger->send_message(new MMonPaxos(MMonPaxos::OP_COLLECT, whoami, pn, pn),
+ mon->monmap->get_inst(i));
}
}
switch (m->get_type()) {
case MSG_MON_PAXOS:
- {
- MMonPaxos *pm = (MMonPaxos*)m;
-
- // NOTE: these ops are defined in messages/MMonPaxos.h
- switch (pm->op) {
- // learner
- case MMonPaxos::OP_COLLECT:
- handle_collect(pm);
- break;
-
- case MMonPaxos::OP_LAST:
- handle_last(pm);
- break;
-
- case MMonPaxos::OP_OLDROUND:
- handle_old_round(pm);
- break;
-
- case MMonPaxos::OP_BEGIN:
- handle_begin(pm);
- break;
-
- case MMonPaxos::OP_ACCEPT:
- handle_accept(pm);
- break;
-
- case MMonPaxos::OP_SUCCESS:
- handle_success(pm);
- break;
-
- case MMonPaxos::OP_ACK:
- handle_ack(pm);
- break;
-
- default:
- assert(0);
- }
- }
+ {
+ MMonPaxos *pm = (MMonPaxos*)m;
+
+ // NOTE: these ops are defined in messages/MMonPaxos.h
+ switch (pm->op) {
+ // learner
+ case MMonPaxos::OP_COLLECT:
+ handle_collect(pm);
+ break;
+
+ case MMonPaxos::OP_LAST:
+ handle_last(pm);
+ break;
+
+ case MMonPaxos::OP_OLDROUND:
+ handle_old_round(pm);
+ break;
+
+ case MMonPaxos::OP_BEGIN:
+ handle_begin(pm);
+ break;
+
+ case MMonPaxos::OP_ACCEPT:
+ handle_accept(pm);
+ break;
+
+ case MMonPaxos::OP_SUCCESS:
+ handle_success(pm);
break;
+ case MMonPaxos::OP_ACK:
+ handle_ack(pm);
+ break;
+
+ default:
+ assert(0);
+ }
+ }
+ break;
+
default:
- assert(0);
+ assert(0);
}
}
// how i receive messages
virtual void dispatch(Message *m) = 0;
-
// how i deal with transmission failures.
- virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) { delete m; }
-
- // lookups
- virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst) { assert(0); return 0; }
-
- // this is how i send messages
- //int send_message(Message *m, msg_addr_t dest, int dest_port);
+ virtual void ms_handle_failure(Message *m, entity_name_t dest, const entity_addr_t& addr) { delete m; }
};
#endif
int nranks = 0;  // this identifies each entity_inst_t
-map<int, FakeMessenger*> directory;
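+// the fake "network": each local messenger is reachable via its entity_addr_t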
+map<entity_addr_t, FakeMessenger*> directory;
hash_map<int, Logger*> loggers;
LogType fakemsg_logtype;
-set<int> shutdown_set;
+set<entity_addr_t> shutdown_set;
Mutex lock;
Cond cond;
-bool pending_timer = false;
-
bool awake = false;
bool fm_shutdown = false;
pthread_t thread_id;
-class C_FakeKicker : public Context {
- void finish(int r) {
- dout(18) << "timer kick" << endl;
- pending_timer = true;
- lock.Lock();
- cond.Signal(); // why not
- lock.Unlock();
- }
-};
-
-void FakeMessenger::callback_kick()
-{
- pending_timer = true;
- lock.Lock();
- cond.Signal(); // why not
- lock.Unlock();
-}
void *fakemessenger_thread(void *ptr)
{
- //dout(1) << "thread start, setting timer kicker" << endl;
- //g_timer.set_messenger_kicker(new C_FakeKicker());
- //msgr_callback_kicker = new C_FakeKicker();
-
lock.Lock();
while (1) {
dout(20) << "thread waiting" << endl;
}
lock.Unlock();
- //cout << "unsetting messenger" << endl;
- //g_timer.unset_messenger_kicker();
- //g_timer.unset_messenger();
- //msgr_callback_kicker = 0;
-
dout(1) << "thread finish (i woke up but no messages, bye)" << endl;
return 0;
}
dout(18) << "do_loop top" << endl;
- /*// timer?
- if (pending_timer) {
- pending_timer = false;
- dout(5) << "pending timer" << endl;
- g_timer.execute_pending();
- }
- */
-
- // callbacks
- lock.Unlock();
- Messenger::do_callbacks();
- lock.Lock();
-
// messages
- map<int, FakeMessenger*>::iterator it = directory.begin();
+ map<entity_addr_t, FakeMessenger*>::iterator it = directory.begin();
while (it != directory.end()) {
FakeMessenger *mgr = it->second;
- dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has " << mgr->num_incoming() << " queued" << endl;
+ dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl;
if (!mgr->is_ready()) {
- dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has no dispatcher, skipping" << endl;
+ dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl;
it++;
continue;
}
if (m) {
//dout(18) << "got " << m << endl;
- dout(1) << "---- '" << m->get_type_name()
- << "' from " << m->get_source() // << ':' << m->get_source_port()
- << " to " << m->get_dest() //<< ':' << m->get_dest_port()
- << " ---- " << m
+ dout(1) << "---- " << m->get_dest()
+ << " <- " << m->get_source()
+ << " ---- " << *m
<< endl;
if (g_conf.fakemessenger_serialize) {
    // deal with shutdowns.. delayed to avoid concurrent directory modification
if (!shutdown_set.empty()) {
- for (set<int>::iterator it = shutdown_set.begin();
+ for (set<entity_addr_t>::iterator it = shutdown_set.begin();
it != shutdown_set.end();
it++) {
dout(7) << "fakemessenger: removing " << *it << " from directory" << endl;
}
-FakeMessenger::FakeMessenger(msg_addr_t me) : Messenger(me)
+FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me)
{
- entity_inst_t fakeinst;
lock.Lock();
{
// assign rank
- fakeinst.addr.sin_port =
- fakeinst.rank = nranks++;
- set_myinst(fakeinst);
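+    // fake addresses differ only by a per-process nonce (what used to be the rank)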
+ _myinst.name = me;
+ _myinst.addr.nonce = nranks++;
// add to directory
- directory[ fakeinst.rank ] = this;
+ directory[ _myinst.addr ] = this;
}
lock.Unlock();
- cout << "fakemessenger " << get_myaddr() << " messenger is " << this << " at " << fakeinst << endl;
-
- //g_timer.set_messenger(this);
+ cout << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << endl;
qlen = 0;
{
//cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl;
lock.Lock();
- assert(directory.count(get_myinst().rank) == 1);
- shutdown_set.insert(get_myinst().rank);
+ assert(directory.count(_myinst.addr) == 1);
+ shutdown_set.insert(_myinst.addr);
/*
directory.erase(myaddr);
return 0;
}
-/*
-void FakeMessenger::trigger_timer(Timer *t)
-{
- // note timer to call
- pending_timer = t;
-
- // wake up thread?
- cond.Signal(); // why not
-}
-*/
-void FakeMessenger::reset_myaddr(msg_addr_t m)
+void FakeMessenger::reset_myname(entity_name_t m)
{
- dout(1) << "reset_myaddr from " << get_myaddr() << " to " << m << endl;
- _set_myaddr(m);
+ dout(1) << "reset_myname from " << get_myname() << " to " << m << endl;
+ _set_myname(m);
+
+ directory.erase(_myinst.addr);
+ _myinst.name = m;
+ directory[_myinst.addr] = this;
+
}
-int FakeMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port, int fromport)
+int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport)
{
- m->set_source(get_myaddr(), fromport);
- m->set_dest(dest, port);
- //m->set_lamport_send_stamp( get_lamport() );
+ entity_name_t dest = inst.name;
+
+ m->set_source(get_myname(), fromport);
+ m->set_source_addr(get_myaddr());
- m->set_source_inst(get_myinst());
+ m->set_dest(inst.name, port);
lock.Lock();
#endif
// queue
- FakeMessenger *dm = directory[inst.rank];
+ FakeMessenger *dm = directory[inst.addr];
if (!dm) {
- dout(1) << "** destination " << dest << " (" << inst << ") dne" << endl;
+ dout(1) << "** destination " << inst << " dne" << endl;
+ for (map<entity_addr_t, FakeMessenger*>::iterator p = directory.begin();
+ p != directory.end();
+ ++p) {
+ dout(1) << "** have " << p->first << " to " << p->second << endl;
+ }
assert(dm);
}
dm->queue_incoming(m);
- dout(1) << "--> " << get_myaddr() << " sending " << m << " '" << m->get_type_name() << "'"
- << " to " << dest
- << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl;
+ dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl;
}
catch (...) {
int qlen;
list<Message*> incoming; // incoming queue
+ entity_inst_t _myinst;
+
public:
- FakeMessenger(msg_addr_t me);
+ FakeMessenger(entity_name_t me);
~FakeMessenger();
virtual int shutdown();
- void reset_myaddr(msg_addr_t m);
+ const entity_inst_t& get_myinst() {
+ return _myinst;
+ };
+ const entity_addr_t& get_myaddr() {
+ return _myinst.addr;
+ }
+
+ void reset_myname(entity_name_t m);
// msg interface
- virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port=0, int fromport=0);
+ virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0);
// events
//virtual void trigger_timer(Timer *t);
int get_dispatch_queue_len() { return qlen; }
- void callback_kick();
-
// -- incoming queue --
// (that nothing uses)
Message *get_message() {
// take note of a live host
-void HostMonitor::host_is_alive(msg_addr_t host)
+void HostMonitor::host_is_alive(entity_name_t host)
{
if (hosts.count(host))
status[host].last_heard_from = g_clock.gettime();
// send out pings
inflight_pings.clear();
- for (set<msg_addr_t>::iterator it = hosts.begin();
+ for (set<entity_name_t>::iterator it = hosts.begin();
it != hosts.end();
it++) {
// have i heard from them recently?
dout(DBL) << "check_heartbeat()" << endl;
// check inflight pings
- for (set<msg_addr_t>::iterator it = inflight_pings.begin();
+ for (set<entity_name_t>::iterator it = inflight_pings.begin();
it != inflight_pings.end();
it++) {
status[*it].num_heartbeats_missed++;
void HostMonitor::handle_ping_ack(MPingAck *m)
{
- msg_addr_t from = m->get_source();
+ entity_name_t from = m->get_source();
dout(DBL) << "ping ack from " << from << endl;
status[from].last_pinged = g_clock.gettime();
// FIXME: this doesn't handle failed -> alive transitions gracefully at all..
// the higher-up's acknowledged our failure notification, we can stop resending it.
- msg_addr_t failed = m->get_failed();
+ entity_name_t failed = m->get_failed();
dout(DBL) << "handle_failure_ack " << failed << endl;
unacked_failures.erase(failed);
acked_failures.insert(failed);
string whoami;
// hosts i monitor
- set<msg_addr_t> hosts;
+ set<entity_name_t> hosts;
// who i tell when they fail
- set<msg_addr_t> notify;
+ set<entity_name_t> notify;
int notify_port;
// their status
- map<msg_addr_t,monitor_rec_t> status;
+ map<entity_name_t,monitor_rec_t> status;
- set<msg_addr_t> inflight_pings; // pings we sent that haven't replied yet
+ set<entity_name_t> inflight_pings; // pings we sent that haven't replied yet
- set<msg_addr_t> unacked_failures; // failed hosts that haven't been acked yet.
- set<msg_addr_t> acked_failures; // these failures have been acked.
+ set<entity_name_t> unacked_failures; // failed hosts that haven't been acked yet.
+ set<entity_name_t> acked_failures; // these failures have been acked.
float heartbeat_interval; // how often to do a heartbeat
float max_ping_time; // how long before it's a miss
this->whoami = whoami;
notify_port = 0;
}
- set<msg_addr_t>& get_hosts() { return hosts; }
- set<msg_addr_t>& get_notify() { return notify; }
+ set<entity_name_t>& get_hosts() { return hosts; }
+ set<entity_name_t>& get_notify() { return notify; }
void set_notify_port(int p) { notify_port = p; }
- void remove_host(msg_addr_t h) {
+ void remove_host(entity_name_t h) {
hosts.erase(h);
status.erase(h);
unacked_failures.erase(h);
void init();
void shutdown();
- void host_is_alive(msg_addr_t who);
+ void host_is_alive(entity_name_t who);
void proc_message(Message *m);
void handle_ping_ack(class MPingAck *m);
}
};
-MPIMessenger::MPIMessenger(msg_addr_t myaddr) : Messenger(myaddr)
+MPIMessenger::MPIMessenger(entity_name_t myaddr) : Messenger(myaddr)
{
// my address
this->myaddr = myaddr;
/* note: send_message _MUST_ be non-blocking */
-int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+int MPIMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport)
{
// set envelope
m->set_source(myaddr, fromport);
class MPIMessenger : public Messenger {
protected:
- msg_addr_t myaddr; // my address
+ entity_name_t myaddr; // my address
//class Logger *logger; // for logging
public:
- MPIMessenger(msg_addr_t myaddr);
+ MPIMessenger(entity_name_t myaddr);
~MPIMessenger();
// init, shutdown MPI and associated event loop thread.
virtual int shutdown();
// message interface
- virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0);
};
/**
}
// send a request and wait for the response
-Message *MTMessenger::sendrecv(Message *m, msg_addr_t dest)
+Message *MTMessenger::sendrecv(Message *m, entity_name_t dest)
{
int dest_tag = 0; // servers listen for any tag
int my_tag = get_tag();
~MTMessenger();
// send a request to a server and wait (block) for the response;
- virtual Message *sendrecv(Message *m, msg_addr_t dest);
+ virtual Message *sendrecv(Message *m, entity_name_t dest);
// wait (block) for a request from anyone
Message *recvreq();
#include "messages/MGenericMessage.h"
+/*
#include "messages/MNSConnect.h"
#include "messages/MNSConnectAck.h"
#include "messages/MNSRegister.h"
#include "messages/MNSLookup.h"
#include "messages/MNSLookupReply.h"
#include "messages/MNSFailure.h"
+*/
#include "messages/MMonPaxos.h"
#include "messages/MPing.h"
#include "messages/MPingAck.h"
-#include "messages/MFailure.h"
-#include "messages/MFailureAck.h"
+//#include "messages/MFailure.h"
+//#include "messages/MFailureAck.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDIn.h"
#include "messages/MMDSGetMap.h"
#include "messages/MMDSMap.h"
-#include "messages/MMDSBoot.h"
+#include "messages/MMDSBeacon.h"
+#include "messages/MMDSImportMap.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSCacheRejoinAck.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
// -- with payload --
+ /*
case MSG_NS_CONNECT:
m = new MNSConnect();
break;
case MSG_NS_FAILURE:
m = new MNSFailure();
break;
+ */
case MSG_MON_PAXOS:
m = new MMonPaxos;
case MSG_PING_ACK:
m = new MPingAck();
break;
+ /*
case MSG_FAILURE:
m = new MFailure();
break;
case MSG_FAILURE_ACK:
m = new MFailureAck();
break;
+ */
case MSG_OSD_BOOT:
m = new MOSDBoot();
case MSG_MDS_MAP:
m = new MMDSMap();
break;
- case MSG_MDS_BOOT:
- m = new MMDSBoot();
+ case MSG_MDS_BEACON:
+ m = new MMDSBeacon;
+ break;
+ case MSG_MDS_IMPORTMAP:
+ m = new MMDSImportMap;
+ break;
+ case MSG_MDS_CACHEREJOIN:
+ m = new MMDSCacheRejoin;
+ break;
+ case MSG_MDS_CACHEREJOINACK:
+ m = new MMDSCacheRejoinAck;
break;
case MSG_MDS_DIRUPDATE:
*
*/
-
-
#ifndef __MESSAGE_H
#define __MESSAGE_H
// *** MDS ***
-#define MSG_MDS_BOOT 100
-#define MSG_MDS_GETMAP 101
-#define MSG_MDS_MAP 102
-#define MSG_MDS_HEARTBEAT 103
+#define MSG_MDS_GETMAP 102
+#define MSG_MDS_MAP 103
+#define MSG_MDS_HEARTBEAT 104 // for mds load balancer
+#define MSG_MDS_BEACON 105 // to monitor
+
+#define MSG_MDS_IMPORTMAP 106
+#define MSG_MDS_CACHEREJOIN 107
+#define MSG_MDS_CACHEREJOINACK 108
#define MSG_MDS_DISCOVER 110
#define MSG_MDS_DISCOVERREPLY 111
using __gnu_cxx::crope;
+#include "include/types.h"
#include "include/buffer.h"
+#include "msg_types.h"
-#include "tcp.h"
-
-
-
-
-// use fixed offsets and static entity -> logical addr mapping!
-#define MSG_ADDR_NAMER_BASE 0
-#define MSG_ADDR_RANK_BASE 1
-#define MSG_ADDR_MDS_BASE 2
-#define MSG_ADDR_OSD_BASE 3
-#define MSG_ADDR_MON_BASE 4
-#define MSG_ADDR_CLIENT_BASE 5
-
-#define MSG_ADDR_NEW -1
-
-
-// new typed msg_addr_t way!
-class msg_addr_t {
-public:
- int _type;
- int _num;
-
- msg_addr_t() : _type(0), _num(0) {}
- msg_addr_t(int t, int n) : _type(t), _num(n) {}
-
- int num() const { return _num; }
- int type() const { return _type; }
- const char *type_str() const {
- switch (type()) {
- case MSG_ADDR_RANK_BASE: return "rank";
- case MSG_ADDR_MDS_BASE: return "mds";
- case MSG_ADDR_OSD_BASE: return "osd";
- case MSG_ADDR_MON_BASE: return "mon";
- case MSG_ADDR_CLIENT_BASE: return "client";
- case MSG_ADDR_NAMER_BASE: return "namer";
- }
- return "unknown";
- }
-
- bool is_new() const { return num() == MSG_ADDR_NEW; }
-
- bool is_client() const { return type() == MSG_ADDR_CLIENT_BASE; }
- bool is_mds() const { return type() == MSG_ADDR_MDS_BASE; }
- bool is_osd() const { return type() == MSG_ADDR_OSD_BASE; }
- bool is_mon() const { return type() == MSG_ADDR_MON_BASE; }
- bool is_namer() const { return type() == MSG_ADDR_NAMER_BASE; }
-};
-
-inline bool operator== (const msg_addr_t& l, const msg_addr_t& r) { return (l._type == r._type) && (l._num == r._num); }
-inline bool operator!= (const msg_addr_t& l, const msg_addr_t& r) { return (l._type != r._type) || (l._num != r._num); }
-inline bool operator< (const msg_addr_t& l, const msg_addr_t& r) { return (l._type < r._type) || (l._type == r._type && l._num < r._num); }
-
-inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) {
- //if (addr.is_namer()) return out << "namer";
- if (addr.is_new() || addr.num() < 0)
- return out << addr.type_str() << "?";
- else
- return out << addr.type_str() << addr.num();
-}
-
-namespace __gnu_cxx {
- template<> struct hash< msg_addr_t >
- {
- size_t operator()( const msg_addr_t m ) const
- {
- static hash<int> H;
- return H(m.type() ^ m.num());
- }
- };
-}
-
-#define MSG_ADDR_RANK(x) msg_addr_t(MSG_ADDR_RANK_BASE,x)
-#define MSG_ADDR_MDS(x) msg_addr_t(MSG_ADDR_MDS_BASE,x)
-#define MSG_ADDR_OSD(x) msg_addr_t(MSG_ADDR_OSD_BASE,x)
-#define MSG_ADDR_MON(x) msg_addr_t(MSG_ADDR_MON_BASE,x)
-#define MSG_ADDR_CLIENT(x) msg_addr_t(MSG_ADDR_CLIENT_BASE,x)
-#define MSG_ADDR_NAMER(x) msg_addr_t(MSG_ADDR_NAMER_BASE,x)
-#define MSG_ADDR_UNDEF msg_addr_t()
-#define MSG_ADDR_DIRECTORY MSG_ADDR_NAMER(0)
-#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(MSG_ADDR_NEW)
-#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(MSG_ADDR_NEW)
-#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(MSG_ADDR_NEW)
-#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(MSG_ADDR_NEW)
-#define MSG_ADDR_NAMER_NEW MSG_ADDR_NAMER(MSG_ADDR_NEW)
-
-
-class entity_inst_t {
- public:
- tcpaddr_t addr;
- __int64_t rank;
-
- entity_inst_t() : rank(-1) {
- memset(&addr, 0, sizeof(addr));
- }
- entity_inst_t(tcpaddr_t& a, int r) : addr(a), rank(r) {
- memset(&addr, 0, sizeof(addr));
- }
-
- void set_addr(tcpaddr_t a) {
- addr = a;
-
- // figure out rank
- rank = *((unsigned*)&a.sin_addr.s_addr);
- rank |= (__uint64_t)a.sin_port << 32;
- }
-};
-
-inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return a.rank == b.rank && a.addr == b.addr; }
-inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return !(a == b); }
-inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return a.rank > b.rank; }
-inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank >= b.rank; }
-inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return a.rank < b.rank; }
-inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank <= b.rank; }
-
-inline ostream& operator<<(ostream& out, const entity_inst_t &i)
-{
- //return out << "rank" << i.rank << "_" << i.addr;
- return out << i.addr;
-}
+// ======================================================
// abstract Message class
typedef struct {
int type;
- msg_addr_t source, dest;
- entity_inst_t source_inst;
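+  // the envelope now carries full (name + address) instances for both endpoints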
+ entity_inst_t src, dst;
int source_port, dest_port;
int nchunks;
- __uint64_t lamport_send_stamp;
- __uint64_t lamport_recv_stamp;
} msg_envelope_t;
#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t)
public:
Message() {
env.source_port = env.dest_port = -1;
- env.source = env.dest = MSG_ADDR_UNDEF;
env.nchunks = 0;
- env.lamport_send_stamp = 0;
- env.lamport_recv_stamp = 0;
};
Message(int t) {
env.source_port = env.dest_port = -1;
- env.source = env.dest = MSG_ADDR_UNDEF;
env.nchunks = 0;
env.type = t;
- env.lamport_send_stamp = 0;
- env.lamport_recv_stamp = 0;
}
virtual ~Message() {
}
- void set_lamport_send_stamp(__uint64_t t) { env.lamport_send_stamp = t; }
- void set_lamport_recv_stamp(__uint64_t t) { env.lamport_recv_stamp = t; }
- __uint64_t get_lamport_send_stamp() { return env.lamport_send_stamp; }
- __uint64_t get_lamport_recv_stamp() { return env.lamport_recv_stamp; }
-
// for rpc-type procedural messages (pcid = procedure call id)
virtual long get_pcid() { return 0; }
virtual char *get_type_name() = 0;
// source/dest
- msg_addr_t& get_dest() { return env.dest; }
- void set_dest(msg_addr_t a, int p) { env.dest = a; env.dest_port = p; }
+ entity_inst_t& get_dest_inst() { return env.dst; }
+ entity_inst_t& get_source_inst() { return env.src; }
+
+ entity_name_t& get_dest() { return env.dst.name; }
+ void set_dest(entity_name_t a, int p) { env.dst.name = a; env.dest_port = p; }
int get_dest_port() { return env.dest_port; }
- msg_addr_t& get_source() { return env.source; }
- void set_source(msg_addr_t a, int p) { env.source = a; env.source_port = p; }
+ entity_name_t& get_source() { return env.src.name; }
+ void set_source(entity_name_t a, int p) { env.src.name = a; env.source_port = p; }
int get_source_port() { return env.source_port; }
- entity_inst_t& get_source_inst() { return env.source_inst; }
- void set_source_inst(const entity_inst_t &i) { env.source_inst = i; }
+ entity_addr_t& get_source_addr() { return env.src.addr; }
+ void set_source_addr(const entity_addr_t &i) { env.src.addr = i; }
// PAYLOAD ----
void reset_payload() {
using namespace std;
-#include "config.h"
-#undef dout
-#define dout(l) if (l<=g_conf.debug) cout << "messenger: "
-#define DEBUGLVL 10 // debug level of output
-
-
-
-// --------
-// callbacks
-
-Mutex msgr_callback_lock;
-list<Context*> msgr_callback_queue;
-//Context* msgr_callback_kicker = 0;
-
-void Messenger::queue_callback(Context *c) {
- msgr_callback_lock.Lock();
- msgr_callback_queue.push_back(c);
- msgr_callback_lock.Unlock();
-
- callback_kick();
-}
-void Messenger::queue_callbacks(list<Context*>& ls) {
- msgr_callback_lock.Lock();
- msgr_callback_queue.splice(msgr_callback_queue.end(), ls);
- msgr_callback_lock.Unlock();
-
- callback_kick();
-}
-
-void Messenger::do_callbacks() {
- // take list
- msgr_callback_lock.Lock();
- list<Context*> ls;
- ls.splice(ls.begin(), msgr_callback_queue);
- msgr_callback_lock.Unlock();
-
- // do them
- for (list<Context*>::iterator it = ls.begin();
- it != ls.end();
- it++) {
- dout(10) << "--- doing callback " << *it << endl;
- (*it)->finish(0);
- delete *it;
- }
-}
-
// ---------
// incoming messages
#include "include/Context.h"
-typedef __uint64_t lamport_t;
-
class MDS;
class Timer;
class Messenger {
private:
Dispatcher *dispatcher;
- msg_addr_t _myaddr;
- entity_inst_t _myinst;
-
+ entity_name_t _myname;
public:
- Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w) { }
+ Messenger(entity_name_t w) : dispatcher(0), _myname(w) { }
virtual ~Messenger() { }
- const entity_inst_t &get_myinst() { return _myinst; }
- void set_myinst(entity_inst_t& v) { _myinst = v; }
-
- msg_addr_t get_myaddr() { return _myaddr; }
- void _set_myaddr(msg_addr_t m) { _myaddr = m; }
+ // accessors
+ entity_name_t get_myname() { return _myname; }
+ void _set_myname(entity_name_t m) { _myname = m; }
- virtual void reset_myaddr(msg_addr_t m) = 0;
+ virtual void reset_myname(entity_name_t m) = 0;
+ virtual const entity_addr_t &get_myaddr() = 0;
- virtual int shutdown() = 0;
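+  // an entity_inst_t pairs the logical name with its network address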
+ entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); }
- // callbacks
- static void do_callbacks();
-
- void queue_callback(Context *c);
- void queue_callbacks(list<Context*>& ls);
- virtual void callback_kick() = 0;
-
+ // hrmpf.
virtual int get_dispatch_queue_len() { return 0; };
// setup
bool is_ready() { return dispatcher != 0; }
// dispatch incoming messages
- virtual void dispatch(Message *m);
+ virtual void dispatch(Message *m) {
+ assert(dispatcher);
+ dispatcher->dispatch(m);
+ }
+
+ // shutdown
+ virtual int shutdown() = 0;
// send message
- virtual void prepare_dest(const entity_inst_t& inst) {}
- //virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;
- virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+ virtual void prepare_dest(const entity_addr_t& addr) {}
+ virtual int send_message(Message *m, entity_inst_t dest,
int port=0, int fromport=0) = 0;
-
// make a procedure call
- //virtual Message* sendrecv(Message *m, msg_addr_t dest, int port=0);
-
+  //virtual Message* sendrecv(Message *m, entity_name_t dest, int port=0);
- virtual void mark_down(msg_addr_t a, entity_inst_t& i) {}
- virtual void mark_up(msg_addr_t a, entity_inst_t& i) {}
- //virtual void reset(msg_addr_t a) { mark_down(a); mark_up(a); }
+ virtual void mark_down(entity_addr_t a) {}
};
<< " addr " << m->get_entity() << endl;
// pick id
- msg_addr_t entity = m->get_entity();
+ entity_name_t entity = m->get_entity();
if (entity.is_new()) {
// make up a new address!
void Rank::Namer::handle_started(Message *m)
{
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(10) << "namer.handle_started from entity " << who << endl;
assert(rank.entity_unstarted.count(who));
void Rank::Namer::handle_unregister(Message *m)
{
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(1) << "namer.handle_unregister entity " << who << endl;
rank.show_dir();
<< endl;
// search for entities on this instance
- list<msg_addr_t> rm;
- for (hash_map<msg_addr_t,entity_inst_t>::iterator i = rank.entity_map.begin();
+ list<entity_name_t> rm;
+ for (hash_map<entity_name_t,entity_inst_t>::iterator i = rank.entity_map.begin();
i != rank.entity_map.end();
i++) {
if (i->second != m->get_inst()) continue;
rm.push_back(i->first);
}
- for (list<msg_addr_t>::iterator i = rm.begin();
+ for (list<entity_name_t>::iterator i = rm.begin();
i != rm.end();
i++) {
dout(10) << "namer.handle_failure inst " << m->get_inst()
// FIXME: possible race before i reclaim lock here?
Dispatcher *dis = 0;
- msg_addr_t dis_dest;
+ entity_name_t dis_dest;
list<Message*> lost;
my_inst.rank = my_rank;
// create my rank
- msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+ entity_name_t raddr = MSG_ADDR_RANK(my_rank);
entity_map[raddr] = my_inst;
entity_unstarted.insert(raddr);
local[raddr] = messenger = new EntityMessenger(raddr);
void Rank::start_namer()
{
// create namer0
- msg_addr_t naddr = MSG_ADDR_NAMER(0);
+ entity_name_t naddr = MSG_ADDR_NAMER(0);
entity_map[naddr] = my_inst;
local[naddr] = new EntityMessenger(naddr);
namer = new Namer(local[naddr]);
{
dout(10) << "show_dir ---" << endl;
- for (hash_map<msg_addr_t, entity_inst_t>::iterator i = entity_map.begin();
+ for (hash_map<entity_name_t, entity_inst_t>::iterator i = entity_map.begin();
i != entity_map.end();
i++) {
if (local.count(i->first)) {
/* lookup
* NOTE: assumes directory.lock held
*/
-void Rank::lookup(msg_addr_t addr)
+void Rank::lookup(entity_name_t addr)
{
dout(10) << "lookup " << addr << endl;
assert(lock.is_locked());
/* register_entity
*/
-Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+Rank::EntityMessenger *Rank::register_entity(entity_name_t addr)
{
dout(10) << "register_entity " << addr << endl;
lock.Lock();
}
-void Rank::prepare_dest(msg_addr_t dest)
+void Rank::prepare_dest(entity_name_t dest)
{
lock.Lock();
void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
{
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
// lookup
EntityMessenger *entity = 0;
void Rank::submit_message(Message *m)
{
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
// lookup
EntityMessenger *entity = 0;
list<Message*> waiting;
dout(10) << "got lookup reply" << endl;
- for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+ for (map<entity_name_t,entity_inst_t>::iterator it = m->entity_map.begin();
it != m->entity_map.end();
it++) {
dout(10) << "lookup got " << it->first << " at " << it->second << endl;
- msg_addr_t addr = it->first;
+ entity_name_t addr = it->first;
entity_inst_t inst = it->second;
if (down.count(addr)) {
* EntityMessenger
*/
-Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) :
Messenger(myaddr),
stop(false),
dispatch_thread(this)
}
-void Rank::EntityMessenger::prepare_send_message(msg_addr_t dest)
+void Rank::EntityMessenger::prepare_send_message(entity_name_t dest)
{
rank.prepare_dest(dest);
}
-int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst)
{
// set envelope
m->set_source(get_myaddr(), 0);
}
-int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport)
{
// set envelope
m->set_source(get_myaddr(), fromport);
}
-void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i)
{
assert(a != get_myaddr());
rank.mark_down(a,i);
}
-void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+void Rank::mark_down(entity_name_t a, entity_inst_t& inst)
{
if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer
lock.Lock();
lock.Unlock();
}
-void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i)
{
assert(a != get_myaddr());
rank.mark_up(a, i);
}
-void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+void Rank::mark_up(entity_name_t a, entity_inst_t& i)
{
if (my_rank == 0) return;
lock.Lock();
int nrank;
int nclient, nmds, nosd, nmon;
- map<msg_addr_t, list<Message*> > waiting;
+ map<entity_name_t, list<Message*> > waiting;
Namer(EntityMessenger *msgr);
~Namer();
bool done;
int sd;
- set<msg_addr_t> entities;
+ set<entity_name_t> entities;
list<Message*> q;
Mutex lock;
}
public:
- EntityMessenger(msg_addr_t myaddr);
+ EntityMessenger(entity_name_t myaddr);
~EntityMessenger();
void ready();
virtual void callback_kick() {}
virtual int shutdown();
- virtual void prepare_send_message(msg_addr_t dest);
- virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
- virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+ virtual void prepare_send_message(entity_name_t dest);
+ virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst);
- virtual void mark_down(msg_addr_t a, entity_inst_t& i);
- virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+ virtual void mark_down(entity_name_t a, entity_inst_t& i);
+ virtual void mark_up(entity_name_t a, entity_inst_t& i);
//virtual void reset(msg_addr_t a);
};
bool single_dispatch_stop;
list<Message*> single_dispatch_queue;
- map<msg_addr_t, list<Message*> > waiting_for_ready;
+ map<entity_name_t, list<Message*> > waiting_for_ready;
void single_dispatcher_entry();
void _submit_single_dispatch(Message *m);
entity_inst_t my_inst;
// lookup
- hash_map<msg_addr_t, entity_inst_t> entity_map;
- hash_set<msg_addr_t> entity_unstarted;
+ hash_map<entity_name_t, entity_inst_t> entity_map;
+ hash_set<entity_name_t> entity_unstarted;
- map<msg_addr_t, list<Message*> > waiting_for_lookup;
- set<msg_addr_t> looking_up;
+ map<entity_name_t, list<Message*> > waiting_for_lookup;
+ set<entity_name_t> looking_up;
- hash_set<msg_addr_t> down;
+ hash_set<entity_name_t> down;
// register
map<int, Cond* > waiting_for_register_cond;
- map<int, msg_addr_t > waiting_for_register_result;
+ map<int, entity_name_t > waiting_for_register_result;
// local
- map<msg_addr_t, EntityMessenger*> local;
+ map<entity_name_t, EntityMessenger*> local;
// remote
hash_map<int, Sender*> rank_sender;
void show_dir();
- void lookup(msg_addr_t addr);
+ void lookup(entity_name_t addr);
void dispatch(Message *m);
void handle_connect_ack(class MNSConnectAck *m);
Sender *connect_rank(const entity_inst_t& inst);
- void mark_down(msg_addr_t addr, entity_inst_t& i);
- void mark_up(msg_addr_t addr, entity_inst_t& i);
+ void mark_down(entity_name_t addr, entity_inst_t& i);
+ void mark_up(entity_name_t addr, entity_inst_t& i);
tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
int start_rank();
void wait();
- EntityMessenger *register_entity(msg_addr_t addr);
+ EntityMessenger *register_entity(entity_name_t addr);
void unregister_entity(EntityMessenger *ms);
void submit_message(Message *m, const entity_inst_t& inst);
- void prepare_dest(msg_addr_t dest);
+ void prepare_dest(entity_name_t dest);
void submit_message(Message *m);
void submit_messages(list<Message*>& ls);
// create a new messenger
- EntityMessenger *new_entity(msg_addr_t addr);
+ EntityMessenger *new_entity(entity_name_t addr);
} ;
<< " addr " << m->get_entity() << endl;
// pick id
- msg_addr_t entity = m->get_entity();
+ entity_name_t entity = m->get_entity();
if (entity.is_new()) {
// make up a new address!
void Rank::Namer::handle_started(Message *m)
{
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(10) << "namer.handle_started from entity " << who << endl;
assert(rank.entity_unstarted.count(who));
void Rank::Namer::handle_unregister(Message *m)
{
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(1) << "namer.handle_unregister entity " << who << endl;
rank.show_dir();
<< endl;
// search for entities on this instance
- list<msg_addr_t> rm;
- for (hash_map<msg_addr_t,entity_inst_t>::iterator i = rank.entity_map.begin();
+ list<entity_name_t> rm;
+ for (hash_map<entity_name_t,entity_inst_t>::iterator i = rank.entity_map.begin();
i != rank.entity_map.end();
i++) {
if (i->second != m->get_inst()) continue;
rm.push_back(i->first);
}
- for (list<msg_addr_t>::iterator i = rm.begin();
+ for (list<entity_name_t>::iterator i = rm.begin();
i != rm.end();
i++) {
dout(10) << "namer.handle_failure inst " << m->get_inst()
// what do i do about reader()? FIXME
// sort my messages by (source) dispatcher, dest.
- map<Dispatcher*, map<msg_addr_t, list<Message*> > > by_dis;
+ map<Dispatcher*, map<entity_name_t, list<Message*> > > by_dis;
lock.Lock();
{
// include out at front of queue
lock.Unlock();
// report failure(s) to dispatcher(s)
- for (map<Dispatcher*, map<msg_addr_t, list<Message*> > >::iterator i = by_dis.begin();
+ for (map<Dispatcher*, map<entity_name_t, list<Message*> > >::iterator i = by_dis.begin();
i != by_dis.end();
++i)
- for (map<msg_addr_t, list<Message*> >::iterator j = i->second.begin();
+ for (map<entity_name_t, list<Message*> >::iterator j = i->second.begin();
j != i->second.end();
++j)
for (list<Message*>::iterator k = j->second.begin();
messenger->set_dispatcher(this);
} else {
// create my rank
- msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+ entity_name_t raddr = MSG_ADDR_RANK(my_rank);
entity_map[raddr] = my_inst;
entity_unstarted.insert(raddr);
local[raddr] = messenger = new EntityMessenger(raddr);
void Rank::start_namer()
{
// create namer0
- msg_addr_t naddr = MSG_ADDR_NAMER(0);
+ entity_name_t naddr = MSG_ADDR_NAMER(0);
entity_map[naddr] = my_inst;
local[naddr] = new EntityMessenger(naddr);
namer = new Namer(local[naddr]);
{
dout(10) << "show_dir ---" << endl;
- for (hash_map<msg_addr_t, entity_inst_t>::iterator i = entity_map.begin();
+ for (hash_map<entity_name_t, entity_inst_t>::iterator i = entity_map.begin();
i != entity_map.end();
i++) {
if (local.count(i->first)) {
/* lookup
* NOTE: assumes directory.lock held
*/
-void Rank::lookup(msg_addr_t addr)
+void Rank::lookup(entity_name_t addr)
{
dout(10) << "lookup " << addr << endl;
assert(lock.is_locked());
/* register_entity
*/
-Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+Rank::EntityMessenger *Rank::register_entity(entity_name_t addr)
{
dout(10) << "register_entity " << addr << endl;
lock.Lock();
void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
{
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
// lookup
EntityMessenger *entity = 0;
void Rank::submit_message(Message *m)
{
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
// lookup
EntityMessenger *entity = 0;
list<Message*> waiting;
dout(10) << "got lookup reply" << endl;
- for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+ for (map<entity_name_t,entity_inst_t>::iterator it = m->entity_map.begin();
it != m->entity_map.end();
it++) {
dout(10) << "lookup got " << it->first << " at " << it->second << endl;
- msg_addr_t addr = it->first;
+ entity_name_t addr = it->first;
entity_inst_t inst = it->second;
if (entity_map.count(addr) &&
* EntityMessenger
*/
-Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) :
Messenger(myaddr),
stop(false),
dispatch_thread(this)
rank.lock.Unlock();
}
-int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst,
int port, int fromport)
{
// set envelope
}
-int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport)
{
// set envelope
m->set_source(get_myaddr(), fromport);
}
-void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i)
{
assert(a != get_myaddr());
rank.mark_down(a,i);
}
-void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+void Rank::mark_down(entity_name_t a, entity_inst_t& inst)
{
//if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer
lock.Lock();
lock.Unlock();
}
-void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i)
{
assert(a != get_myaddr());
rank.mark_up(a, i);
}
-void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+void Rank::mark_up(entity_name_t a, entity_inst_t& i)
{
if (my_rank == 0) return;
lock.Lock();
int nrank;
int nclient, nmds, nosd, nmon;
- map<msg_addr_t, list<Message*> > waiting;
+ map<entity_name_t, list<Message*> > waiting;
Namer(EntityMessenger *msgr);
~Namer();
}
public:
- EntityMessenger(msg_addr_t myaddr);
+ EntityMessenger(entity_name_t myaddr);
~EntityMessenger();
void ready();
virtual void callback_kick() {}
virtual int shutdown();
virtual void prepare_dest(const entity_inst_t& inst);
- virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
- virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+ virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst,
int port=0, int fromport=0);
- virtual void mark_down(msg_addr_t a, entity_inst_t& i);
- virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+ virtual void mark_down(entity_name_t a, entity_inst_t& i);
+ virtual void mark_up(entity_name_t a, entity_inst_t& i);
//virtual void reset(msg_addr_t a);
};
bool single_dispatch_stop;
list<Message*> single_dispatch_queue;
- map<msg_addr_t, list<Message*> > waiting_for_ready;
+ map<entity_name_t, list<Message*> > waiting_for_ready;
void single_dispatcher_entry();
void _submit_single_dispatch(Message *m);
entity_inst_t my_inst;
// lookup
- hash_map<msg_addr_t, entity_inst_t> entity_map;
- hash_set<msg_addr_t> entity_unstarted;
+ hash_map<entity_name_t, entity_inst_t> entity_map;
+ hash_set<entity_name_t> entity_unstarted;
- map<msg_addr_t, list<Message*> > waiting_for_lookup;
- set<msg_addr_t> looking_up;
+ map<entity_name_t, list<Message*> > waiting_for_lookup;
+ set<entity_name_t> looking_up;
// register
map<int, Cond* > waiting_for_register_cond;
- map<int, msg_addr_t > waiting_for_register_result;
+ map<int, entity_name_t > waiting_for_register_result;
// local
- map<msg_addr_t, EntityMessenger*> local;
+ map<entity_name_t, EntityMessenger*> local;
// remote
hash_map<int, Pipe*> rank_pipe;
void show_dir();
- void lookup(msg_addr_t addr);
+ void lookup(entity_name_t addr);
void dispatch(Message *m);
void handle_connect_ack(class MNSConnectAck *m);
Pipe *connect_rank(const entity_inst_t& inst);
- void mark_down(msg_addr_t addr, entity_inst_t& i);
- void mark_up(msg_addr_t addr, entity_inst_t& i);
+ void mark_down(entity_name_t addr, entity_inst_t& i);
+ void mark_up(entity_name_t addr, entity_inst_t& i);
tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
int start_rank();
void wait();
- EntityMessenger *register_entity(msg_addr_t addr);
+ EntityMessenger *register_entity(entity_name_t addr);
void unregister_entity(EntityMessenger *ms);
void submit_message(Message *m, const entity_inst_t& inst);
void submit_messages(list<Message*>& ls);
// create a new messenger
- EntityMessenger *new_entity(msg_addr_t addr);
+ EntityMessenger *new_entity(entity_name_t addr);
} ;
class SerialMessenger : public Dispatcher {
public:
virtual void dispatch(Message *m) = 0; // i receive my messages here
- virtual void send(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // doesn't block
- virtual Message *sendrecv(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply
+ virtual void send(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // doesn't block
+ virtual Message *sendrecv(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply
};
#endif
#include "config.h"
#include "messages/MGenericMessage.h"
-#include "messages/MNSConnect.h"
-#include "messages/MNSConnectAck.h"
-#include "messages/MNSRegister.h"
-#include "messages/MNSRegisterAck.h"
-#include "messages/MNSLookup.h"
-#include "messages/MNSLookupReply.h"
-#include "messages/MNSFailure.h"
//#include "messages/MFailure.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_inst.addr << " "
-#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_inst.addr << " "
+#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_addr << " "
+#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_addr << " "
// bind to a socket
dout(10) << "accepter.start binding to listen " << endl;
+  // use whatever the user specified..
+ g_my_addr.make_addr(rank.listen_addr);
+
/* socket creation */
listen_sd = socket(AF_INET,SOCK_STREAM,0);
assert(listen_sd > 0);
// listen!
rc = ::listen(listen_sd, 1000);
assert(rc >= 0);
+ //dout(10) << "accepter.start listening on " << myport << endl;
- // my address is...
+ // my address is... HELP HELP HELP!
char host[100];
bzero(host, 100);
gethostname(host, 100);
struct hostent *myhostname = gethostbyname( host );
- struct sockaddr_in my_addr;
- memset(&my_addr, 0, sizeof(my_addr));
-
- my_addr.sin_family = myhostname->h_addrtype;
- memcpy((char *) &my_addr.sin_addr.s_addr,
- myhostname->h_addr_list[0],
- myhostname->h_length);
- my_addr.sin_port = htons(myport);
+ // figure out my_addr
+ if (g_my_addr.port > 0) {
+ // user specified it, easy.
+ rank.my_addr = g_my_addr;
+ } else {
+    // try to figure out what ip i can be reached at
+ memset(&rank.listen_addr, 0, sizeof(rank.listen_addr));
+
+ // look up my hostname. blech! this sucks.
+ rank.listen_addr.sin_family = myhostname->h_addrtype;
+ memcpy((char *) &rank.listen_addr.sin_addr.s_addr,
+ myhostname->h_addr_list[0],
+ myhostname->h_length);
+ rank.listen_addr.sin_port = htons(myport);
+ rank.my_addr.set_addr(rank.listen_addr);
+ }
- rank.listen_addr = my_addr;
+ // set a nonce
+ rank.my_addr.nonce = getpid(); // FIXME: pid might not be best choice here.
- dout(10) << "accepter.start listen addr is " << rank.listen_addr << endl;
+ dout(10) << "accepter.start my addr is " << rank.my_addr << endl;
// set up signal handler
signal(SIGINT, simplemessenger_sigint);
dout(10) << "accepted incoming on sd " << sd << endl;
rank.lock.Lock();
- Pipe *p = new Pipe(sd);
- rank.pipes.insert(p);
+ if (!rank.local.empty()) {
+ Pipe *p = new Pipe(sd);
+ rank.pipes.insert(p);
+ }
rank.lock.Unlock();
} else {
dout(10) << "no incoming connection?" << endl;
// my creater gave me sd via accept()
// announce myself.
- int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+ int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr));
if (rc < 0) {
::close(sd);
done = true;
}
// identify peer
- rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst));
+ rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr));
if (rc < 0) {
dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl;
::close(sd);
writer_thread.create();
// register pipe.
- if (peer_inst.rank >= 0) {
- rank.lock.Lock();
- {
- if (rank.rank_pipe.count(peer_inst.rank) == 0) {
- // install a pipe!
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl;
- rank.rank_pipe[peer_inst.rank] = this;
+ rank.lock.Lock();
+ {
+ if (rank.rank_pipe.count(peer_addr) == 0) {
+ // install a pipe!
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << endl;
+ rank.rank_pipe[peer_addr] = this;
+ } else {
+      // if both ends connected at once, keep only one Pipe: the lower address "wins"
+ if (peer_addr < rank.my_addr) {
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr
+ << ", already had pipe, but switching to this new one" << endl;
+ // switch to this new Pipe
+ rank.rank_pipe[peer_addr]->close(); // close old one
+ rank.rank_pipe[peer_addr] = this;
} else {
- // low ranks' Pipes "win"
- if (peer_inst.rank < rank.my_inst.rank ||
- rank.my_inst.rank < 0) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
- << ", already had pipe, but switching to this new one" << endl;
- // switch to this new Pipe
- rank.rank_pipe[peer_inst.rank]->close(); // close old one
- rank.rank_pipe[peer_inst.rank] = this;
- } else {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
- << ", already had pipe, sticking with it" << endl;
- }
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr
+ << ", already had pipe, sticking with it" << endl;
}
}
- rank.lock.Unlock();
- } else {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl;
}
+ rank.lock.Unlock();
return 0; // success.
}
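
The accept path above resolves the simultaneous-connect race purely by address comparison now that ranks are gone: if a Pipe for peer_addr is already registered, the newly accepted one replaces it only when the peer's address sorts below ours. A minimal, self-contained sketch of just that tiebreak rule, using toy Addr/Pipe types rather than the real entity_addr_t or Rank::Pipe:

#include <iostream>
#include <map>
#include <string>
#include <utility>

// toy stand-ins for entity_addr_t and Rank::Pipe
struct Addr {
  int ip, port;
  Addr(int i, int p) : ip(i), port(p) {}
};
bool operator<(const Addr &a, const Addr &b) {
  return a.ip < b.ip || (a.ip == b.ip && a.port < b.port);
}

struct Pipe {
  std::string label;
  explicit Pipe(const std::string &l) : label(l) {}
};

// register a freshly accepted pipe for `peer`; if one already exists,
// keep the new one only when peer < my_addr ("low addresses win").
void accept_pipe(std::map<Addr, Pipe*> &rank_pipe, const Addr &my_addr,
                 const Addr &peer, Pipe *newpipe) {
  std::map<Addr, Pipe*>::iterator it = rank_pipe.find(peer);
  if (it == rank_pipe.end()) {
    rank_pipe.insert(std::make_pair(peer, newpipe));   // no pipe yet: install it
  } else if (peer < my_addr) {
    delete it->second;                                 // "close" the old pipe
    it->second = newpipe;                              // switch to the new one
  } else {
    delete newpipe;                                    // stick with the old pipe
  }
}

int main() {
  std::map<Addr, Pipe*> rank_pipe;
  Addr me(10, 6800), peer(5, 6800);                    // peer sorts below me
  accept_pipe(rank_pipe, me, peer, new Pipe("first"));
  accept_pipe(rank_pipe, me, peer, new Pipe("second"));
  std::cout << rank_pipe.find(peer)->second->label << std::endl;  // prints "second"
  delete rank_pipe.find(peer)->second;
  return 0;
}
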
int Rank::Pipe::connect()
{
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << endl;
// create socket?
sd = socket(AF_INET,SOCK_STREAM,0);
assert(rc>=0);
// connect!
- rc = ::connect(sd, (struct sockaddr*)&peer_inst.addr, sizeof(myAddr));
+ tcpaddr_t tcpaddr;
+ peer_addr.make_addr(tcpaddr);
+ rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(tcpaddr));
if (rc < 0) {
- dout(10) << "connect error " << peer_inst
+ dout(10) << "connect error " << peer_addr
<< ", " << errno << ": " << strerror(errno) << endl;
return rc;
}
- // identify peer ..... FIXME
- entity_inst_t inst;
- rc = tcp_read(sd, (char*)&inst, sizeof(inst));
- if (inst.rank < 0)
- inst = peer_inst; // i know better than they do.
- if (peer_inst != inst && inst.rank > 0) {
- derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl;
+ // identify peer
+ entity_addr_t paddr;
+ rc = tcp_read(sd, (char*)&paddr, sizeof(paddr));
+ if (!rc) { // tcp_read returns a bool-style success flag here
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << endl;
+ return -1;
+ }
+ if (peer_addr != paddr) {
+ derr(0) << "pipe(" << peer_addr << ' ' << this << ").connect peer is " << paddr << ", wtf" << endl;
assert(0);
return -1;
}
// identify myself
- rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+ rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr));
if (rc < 0)
return -1;
// register pipe
rank.lock.Lock();
{
- if (rank.rank_pipe.count(peer_inst.rank) == 0) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl;
- rank.rank_pipe[peer_inst.rank] = this;
+ if (rank.rank_pipe.count(peer_addr) == 0) {
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << endl;
+ rank.rank_pipe[peer_addr] = this;
} else {
// this is normal.
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << endl;
}
}
rank.lock.Unlock();
void Rank::Pipe::close()
{
- if (sent_close) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl;
- return;
- }
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << endl;
// unreg ourselves
rank.lock.Lock();
{
- if (rank.rank_pipe.count(peer_inst.rank) &&
- rank.rank_pipe[peer_inst.rank] == this) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl;
- rank.rank_pipe.erase(peer_inst.rank);
+ if (rank.rank_pipe.count(peer_addr) &&
+ rank.rank_pipe[peer_addr] == this) {
+ dout(10) << "pipe(" << peer_addr << ' ' << this
+ << ").close unregistering pipe" << endl;
+ rank.rank_pipe.erase(peer_addr);
}
}
rank.lock.Unlock();
- // queue close message.
- if (socket_error) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").close not queueing MSG_CLOSE, socket error" << endl;
+ // queue close message?
+ if (!need_to_send_close) {
+ dout(10) << "pipe(" << peer_addr << ' ' << this
+ << ").close already closing/closed" << endl;
+ return;
+ }
+
+ if (!writer_running) {
+ dout(10) << "pipe(" << peer_addr << ' ' << this
+ << ").close not queueing MSG_CLOSE, no writer running" << endl;
} else {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this
+ << ").close queueing MSG_CLOSE" << endl;
lock.Lock();
q.push_back(new MGenericMessage(MSG_CLOSE));
cond.Signal();
- sent_close = true;
+ need_to_send_close = false;
lock.Unlock();
}
}
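
The old sent_close/socket_error pair collapses into a single need_to_send_close flag: it starts true, MSG_CLOSE is queued at most once (and only while a writer thread is still running to flush it), and the flag is also cleared when the reader sees the peer's MSG_CLOSE or a socket write fails, so nothing is queued onto a dead socket. A compressed, single-threaded sketch of that state machine, with toy types and no locking:

#include <deque>
#include <iostream>

struct Msg { int type; explicit Msg(int t) : type(t) {} };
const int MSG_CLOSE = 0;

struct PipeSketch {
  std::deque<Msg> q;            // outgoing queue, drained by the writer
  bool need_to_send_close;
  bool writer_running;
  PipeSketch() : need_to_send_close(true), writer_running(true) {}

  // mirrors Rank::Pipe::close(): queue MSG_CLOSE at most once
  void close() {
    if (!need_to_send_close) return;       // already closing/closed
    if (!writer_running) return;           // nobody left to flush it
    q.push_back(Msg(MSG_CLOSE));
    need_to_send_close = false;
  }

  // mirrors the read/write error paths: the socket is gone, so never
  // try to queue MSG_CLOSE for it
  void on_socket_error() { need_to_send_close = false; }
};

int main() {
  PipeSketch p;
  p.close();
  p.close();                               // second call is a no-op
  std::cout << p.q.size() << std::endl;    // prints 1
  return 0;
}
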
if (!m || m->get_type() == 0) {
if (m) {
delete m;
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << endl;
+ need_to_send_close = false;
} else {
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << endl;
}
- if (!sent_close)
- close();
+ close();
done = true;
cond.Signal(); // wake up writer too.
break;
}
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message for " << m->get_dest() << endl;
EntityMessenger *entity = 0;
rank.lock.Lock();
{
- if (rank.entity_map.count(m->get_source()) &&
- rank.entity_map[m->get_source()] > m->get_source_inst()) {
- derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source()
- << " inst " << m->get_source_inst()
- << " > " << rank.entity_map[m->get_source()]
- << ", WATCH OUT " << *m << endl;
- assert(0);
- }
-
if (g_conf.ms_single_dispatch) {
// submit to single dispatch queue
rank._submit_single_dispatch(m);
} else {
entity = rank.find_unnamed(m->get_dest());
if (!entity) {
- derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
- assert(0); // FIXME do this differently
+ if (rank.stopped.count(m->get_dest())) {
+ // ignore it
+ } else {
+ derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+ assert(0); // FIXME do this differently
+ }
}
}
}
lock.Unlock();
if (reap) {
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << endl;
::close(sd);
rank.lock.Lock();
{
if (!server) {
int rc = connect();
if (rc < 0) {
- derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting, "
+ derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, "
<< errno << ": " << strerror(errno)
<< endl;
done = true;
while (!q.empty() || !done) {
if (!q.empty()) {
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << endl;
// grab outgoing list
list<Message*> out;
Message *m = out.front();
out.pop_front();
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << *m << endl;
// stamp.
- m->set_source_inst(rank.my_inst);
+ m->set_source_addr(rank.my_addr);
// marshall
if (m->empty_payload())
if (write_message(m) < 0) {
// failed!
- derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest()
+ derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest()
<< ", " << errno << ": " << strerror(errno)
<< endl;
out.push_front(m);
}
// wait
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << endl;
cond.Wait(lock);
}
lock.Unlock();
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << endl;
// reap?
bool reap = false;
lock.Unlock();
if (reap) {
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << endl;
::close(sd);
rank.lock.Lock();
{
msg_envelope_t env;
if (!tcp_read( sd, (char*)&env, sizeof(env) )) {
- socket_error = true;
+ need_to_send_close = false;
return 0;
}
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type
- << " src " << env.source << " dst " << env.dest
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type
+ << " src " << env.src << " dst " << env.dst
<< " nchunks=" << env.nchunks
<< endl;
for (int i=0; i<env.nchunks; i++) {
int size;
if (!tcp_read( sd, (char*)&size, sizeof(size) )) {
- socket_error = true;
+ need_to_send_close = false;
return 0;
}
bufferptr bp(size);
if (!tcp_read( sd, bp.c_str(), size )) {
- socket_error = true;
+ need_to_send_close = false;
return 0;
}
blist.push_back(bp);
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got frag " << i << " of " << env.nchunks
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got frag " << i << " of " << env.nchunks
<< " len " << bp.length() << endl;
}
size_t s = blist.length();
Message *m = decode_message(env, blist);
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got " << s << " byte message from "
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got " << s << " byte message from "
<< m->get_source() << endl;
return m;
env->nchunks = 1;
#endif
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m
<< " to " << m->get_dest()
<< endl;
// send envelope
int r = tcp_write( sd, (char*)env, sizeof(*env) );
if (r < 0) {
- derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m
+ derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending envelope for " << *m
<< " to " << m->get_dest() << endl;
- socket_error = true;
+ need_to_send_close = false;
return -1;
}
for (list<bufferptr>::const_iterator it = blist.buffers().begin();
it != blist.buffers().end();
it++) {
- dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
int size = (*it).length();
r = tcp_write( sd, (char*)&size, sizeof(size) );
if (r < 0) {
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
- socket_error = true;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
+ need_to_send_close = false;
return -1;
}
r = tcp_write( sd, (*it).c_str(), size );
if (r < 0) {
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl;
- socket_error = true;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl;
+ need_to_send_close = false;
return -1;
}
i++;
int size = blist.length();
r = tcp_write( sd, (char*)&size, sizeof(size) );
if (r < 0) {
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
- socket_error = true;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
+ need_to_send_close = false;
return -1;
}
- dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;
+ dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;
for (list<bufferptr>::const_iterator it = blist.buffers().begin();
it != blist.buffers().end();
if ((*it).length() == 0) continue; // blank buffer.
r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() );
if (r < 0) {
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
- socket_error = true;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
+ need_to_send_close = false;
return -1;
}
}
void Rank::Pipe::fail(list<Message*>& out)
{
- derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl;
+ derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << endl;
// FIXME: possible race before i reclaim lock here?
// deactivate myself
rank.lock.Lock();
{
- if (rank.rank_pipe.count(peer_inst.rank) &&
- rank.rank_pipe[peer_inst.rank] == this)
- rank.rank_pipe.erase(peer_inst.rank);
+ if (rank.rank_pipe.count(peer_addr) &&
+ rank.rank_pipe[peer_addr] == this)
+ rank.rank_pipe.erase(peer_addr);
}
rank.lock.Unlock();
// what do i do about reader()? FIXME
// sort my messages by (source) dispatcher, dest.
- map<Dispatcher*, map<msg_addr_t, list<Message*> > > by_dis;
+ map<Dispatcher*, map<entity_name_t, list<Message*> > > by_dis;
lock.Lock();
{
// include out at front of queue
Dispatcher *dis = mgr->get_dispatcher();
if (mgr->is_stopped()) {
// ignore.
- dout(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl;
+ dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl;
delete q.front();
} else {
by_dis[dis][q.front()->get_dest()].push_back(q.front());
lock.Unlock();
// report failure(s) to dispatcher(s)
- for (map<Dispatcher*, map<msg_addr_t, list<Message*> > >::iterator i = by_dis.begin();
+ for (map<Dispatcher*, map<entity_name_t, list<Message*> > >::iterator i = by_dis.begin();
i != by_dis.end();
++i)
- for (map<msg_addr_t, list<Message*> >::iterator j = i->second.begin();
+ for (map<entity_name_t, list<Message*> >::iterator j = i->second.begin();
j != i->second.end();
++j)
for (list<Message*>::iterator k = j->second.begin();
k != j->second.end();
++k) {
- derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl;
- i->first->ms_handle_failure(*k, j->first, peer_inst);
+ derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_addr << endl;
+ i->first->ms_handle_failure(*k, j->first, peer_addr);
}
}
{
}
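
fail() reports the unsent queue by first bucketing messages into a map keyed by dispatcher and then by destination entity, so each dispatcher sees its failures grouped per target. The nested-map grouping, stripped down to toy stand-ins for Dispatcher, entity_name_t, and Message:

#include <iostream>
#include <list>
#include <map>
#include <string>

typedef std::string Name;                  // stand-in for entity_name_t
struct Msg {
  Name dest;
  explicit Msg(const Name &d) : dest(d) {}
};
struct Dispatcher {
  void ms_handle_failure(Msg *m, const Name &dest) {
    std::cout << "failure reported for dest " << dest << std::endl;
    delete m;
  }
};

int main() {
  Dispatcher dis;
  std::list<Msg*> q;                       // the pipe's unsent messages
  q.push_back(new Msg("osd3"));
  q.push_back(new Msg("osd3"));
  q.push_back(new Msg("mon0"));

  // sort my messages by (source) dispatcher, dest
  std::map<Dispatcher*, std::map<Name, std::list<Msg*> > > by_dis;
  for (std::list<Msg*>::iterator i = q.begin(); i != q.end(); ++i)
    by_dis[&dis][(*i)->dest].push_back(*i);

  // report failure(s) to dispatcher(s)
  for (std::map<Dispatcher*, std::map<Name, std::list<Msg*> > >::iterator i = by_dis.begin();
       i != by_dis.end(); ++i)
    for (std::map<Name, std::list<Msg*> >::iterator j = i->second.begin();
         j != i->second.end(); ++j)
      for (std::list<Msg*>::iterator k = j->second.begin();
           k != j->second.end(); ++k)
        i->first->ms_handle_failure(*k, j->first);
  return 0;
}
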
+/*
void Rank::set_listen_addr(tcpaddr_t& a)
{
dout(10) << "set_listen_addr " << a << endl;
memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4);
listen_addr.sin_port = a.sin_port;
}
-
+*/
void Rank::_submit_single_dispatch(Message *m)
{
ls.pop_front();
dout(1) << m->get_dest()
- << " <-- " << m->get_source() << " " << m->get_source_inst()
+ << " <-- " << m->get_source_inst()
<< " ---- " << *m
<< " -- " << m
<< endl;
while (!pipe_reap_queue.empty()) {
Pipe *p = pipe_reap_queue.front();
- dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl;
+ dout(10) << "reaper reaping pipe " << p->get_peer_addr() << endl;
pipe_reap_queue.pop_front();
assert(pipes.count(p));
pipes.erase(p);
p->join();
- dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl;
+ dout(10) << "reaper reaped pipe " << p->get_peer_addr() << endl;
delete p;
}
}
lock.Lock();
- // my_inst
- my_inst.set_addr( listen_addr );
-
- dout(1) << "start_rank at " << my_inst << endl;
+ dout(1) << "start_rank at " << listen_addr << endl;
lock.Unlock();
return 0;
/* connect_rank
* NOTE: assumes rank.lock held.
*/
-Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst)
+Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr)
{
assert(rank.lock.is_locked());
- assert(inst != rank.my_inst);
+ assert(addr != rank.my_addr);
- dout(10) << "connect_rank to " << inst << endl;
+ dout(10) << "connect_rank to " << addr << endl;
// create pipe
- Pipe *pipe = new Pipe(inst);
- rank.rank_pipe[inst.rank] = pipe;
+ Pipe *pipe = new Pipe(addr);
+ rank.rank_pipe[addr] = pipe;
pipes.insert(pipe);
return pipe;
-void Rank::show_dir()
-{
- dout(10) << "show_dir ---" << endl;
-
- for (hash_map<msg_addr_t, entity_inst_t>::iterator i = entity_map.begin();
- i != entity_map.end();
- i++) {
- if (local.count(i->first)) {
- dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl;
- } else {
- dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl;
- }
- }
-}
-Rank::EntityMessenger *Rank::find_unnamed(msg_addr_t a)
+Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a)
{
// find an unnamed local entity of the right type
- for (map<msg_addr_t, EntityMessenger*>::iterator p = local.begin();
+ for (map<entity_name_t, EntityMessenger*>::iterator p = local.begin();
p != local.end();
++p) {
if (p->first.type() == a.type() && p->first.is_new())
/* register_entity
*/
-Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+Rank::EntityMessenger *Rank::register_entity(entity_name_t name)
{
- dout(10) << "register_entity " << addr << endl;
+ dout(10) << "register_entity " << name << endl;
lock.Lock();
// create messenger
- EntityMessenger *msgr = new EntityMessenger(addr);
+ EntityMessenger *msgr = new EntityMessenger(name);
// add to directory
- entity_map[addr] = my_inst;
- local[addr] = msgr;
+ local[name] = msgr;
lock.Unlock();
return msgr;
void Rank::unregister_entity(EntityMessenger *msgr)
{
lock.Lock();
- dout(10) << "unregister_entity " << msgr->get_myaddr() << endl;
+ dout(10) << "unregister_entity " << msgr->get_myname() << endl;
// remove from local directory.
- assert(local.count(msgr->get_myaddr()));
- local.erase(msgr->get_myaddr());
- assert(entity_map.count(msgr->get_myaddr()));
- entity_map.erase(msgr->get_myaddr());
-
+ entity_name_t name = msgr->get_myname();
+ assert(local.count(name));
+ local.erase(name);
+
+ stopped.insert(name);
wait_cond.Signal();
lock.Unlock();
}
-void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
+void Rank::submit_message(Message *m, const entity_addr_t& dest_addr)
{
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
// lookup
EntityMessenger *entity = 0;
lock.Lock();
{
// local?
- if (dest_inst.rank == my_inst.rank) {
+ if (dest_addr == my_addr) {
if (local.count(dest)) {
// local
dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
entity = local[dest];
}
} else {
- derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but not in local map?" << endl;
+ derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl;
assert(0); // hmpf
}
}
else {
// remote.
- if (rank_pipe.count( dest_inst.rank )) {
- dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl;
+ if (rank_pipe.count( dest_addr )) {
+ dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << endl;
// connected.
- pipe = rank_pipe[ dest_inst.rank ];
+ pipe = rank_pipe[ dest_addr ];
} else {
- dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+ dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << endl;
// not connected.
- pipe = connect_rank( dest_inst );
+ pipe = connect_rank( dest_addr );
}
}
}
if (local.empty()) {
dout(10) << "wait: everything stopped" << endl;
break; // everything stopped.
+ } else {
+ dout(10) << "wait: local still has " << local.size() << " items, waiting" << endl;
}
wait_cond.Wait(lock);
// done! clean up.
+ //dout(10) << "wait: stopping accepter thread" << endl;
+ //accepter.stop();
+
// stop dispatch thread
if (g_conf.ms_single_dispatch) {
dout(10) << "wait: stopping dispatch thread" << endl;
{
dout(10) << "wait: closing pipes" << endl;
list<Pipe*> toclose;
- for (hash_map<__int64_t,Pipe*>::iterator i = rank_pipe.begin();
+ for (hash_map<entity_addr_t,Pipe*>::iterator i = rank_pipe.begin();
i != rank_pipe.end();
i++)
toclose.push_back(i->second);
* EntityMessenger
*/
-Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) :
Messenger(myaddr),
stop(false),
dispatch_thread(this)
{
- set_myinst(rank.my_inst);
}
Rank::EntityMessenger::~EntityMessenger()
{
{
// deliver
while (!ls.empty()) {
+ if (stop) {
+ dout(1) << "dispatch: stop=true, discarding " << ls.size()
+ << " messages in dispatch queue" << endl;
+ break;
+ }
Message *m = ls.front();
ls.pop_front();
dout(1) << m->get_dest()
- << " <-- " << m->get_source() << " " << m->get_source_inst()
+ << " <-- " << m->get_source_inst()
<< " ---- " << *m
<< " -- " << m
<< endl;
cond.Wait(lock);
}
lock.Unlock();
+
+ // deregister
+ rank.unregister_entity(this);
}
void Rank::EntityMessenger::ready()
if (g_conf.ms_single_dispatch) {
rank.lock.Lock();
- if (rank.waiting_for_ready.count(get_myaddr())) {
+ if (rank.waiting_for_ready.count(get_myname())) {
rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
- rank.waiting_for_ready[get_myaddr()]);
- rank.waiting_for_ready.erase(get_myaddr());
+ rank.waiting_for_ready[get_myname()]);
+ rank.waiting_for_ready.erase(get_myname());
rank.single_dispatch_cond.Signal();
}
rank.lock.Unlock();
{
dout(10) << "shutdown " << get_myaddr() << endl;
- // deregister
- rank.unregister_entity(this);
-
// stop my dispatch thread
if (dispatch_thread.am_self()) {
dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
}
-void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst)
+void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr)
{
rank.lock.Lock();
{
- if (rank.rank_pipe.count(inst.rank) == 0)
- rank.connect_rank(inst);
+ if (rank.rank_pipe.count(addr) == 0)
+ rank.connect_rank(addr);
}
rank.lock.Unlock();
}
-int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest,
int port, int fromport)
{
// set envelope
- m->set_source(get_myaddr(), fromport);
- m->set_dest(dest, port);
-
- m->set_source_inst(rank.my_inst);
-
+ m->set_source(get_myname(), fromport);
+ m->set_source_addr(rank.my_addr);
+ m->set_dest(dest.name, port);
+
dout(1) << m->get_source()
- << " --> " << m->get_dest() << " " << inst
+ << " --> " << dest.name << " " << dest.addr
<< " -- " << *m
<< " -- " << m
<< endl;
- rank.submit_message(m, inst);
+ rank.submit_message(m, dest.addr);
return 0;
}
-void Rank::EntityMessenger::reset_myaddr(msg_addr_t newaddr)
+
+const entity_addr_t &Rank::EntityMessenger::get_myaddr()
+{
+ return rank.my_addr;
+}
+
+
+void Rank::EntityMessenger::reset_myname(entity_name_t newname)
{
- msg_addr_t oldaddr = get_myaddr();
- dout(10) << "set_myaddr " << oldaddr << " to " << newaddr << endl;
+ entity_name_t oldname = get_myname();
+ dout(10) << "reset_myname " << oldname << " to " << newname << endl;
- rank.entity_map.erase(oldaddr);
- rank.local.erase(oldaddr);
- rank.entity_map[newaddr] = rank.my_inst;
- rank.local[newaddr] = this;
+ rank.local.erase(oldname);
+ rank.local[newname] = this;
- _set_myaddr(newaddr);
+ _set_myname(newname);
}
-void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+void Rank::EntityMessenger::mark_down(entity_addr_t a)
{
- assert(a != get_myaddr());
- rank.mark_down(a,i);
+ rank.mark_down(a);
}
-void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+void Rank::mark_down(entity_addr_t addr)
{
//if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer
lock.Lock();
+ /*
if (entity_map.count(a) &&
entity_map[a] > inst) {
dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
entity_map.erase(a);
- if (rank_pipe.count(inst.rank)) {
- rank_pipe[inst.rank]->close();
- rank_pipe.erase(inst.rank);
+ if (rank_pipe.count(inst)) {
+ rank_pipe[inst]->close();
+ rank_pipe.erase(inst);
}
}
}
+ */
lock.Unlock();
}
-void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
-{
- assert(a != get_myaddr());
- rank.mark_up(a, i);
-}
-
-void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
-{
- lock.Lock();
- {
- dout(10) << "mark_up " << a << " inst " << i << endl;
- derr(10) << "mark_up " << a << " inst " << i << endl;
-
- if (entity_map.count(a) == 0 ||
- entity_map[a] < i) {
- entity_map[a] = i;
- connect_rank(i);
- } else if (entity_map[a] == i) {
- dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
- derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
- } else {
- dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
- derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
- }
-
- //if (waiting_for_lookup.count(a))
- //lookup(a);
- }
- lock.Unlock();
-}
protected:
int sd;
bool done;
- entity_inst_t peer_inst;
+ entity_addr_t peer_addr;
bool server;
- bool sent_close;
- bool socket_error;
+ bool need_to_send_close;
bool reader_running;
bool writer_running;
void *entry() { pipe->writer(); return 0; }
} writer_thread;
friend class Writer;
-
+
public:
Pipe(int s) : sd(s),
done(false), server(true),
- sent_close(false), socket_error(false),
+ need_to_send_close(true),
reader_running(false), writer_running(false),
reader_thread(this), writer_thread(this) {
// server
reader_running = true;
reader_thread.create();
}
- Pipe(const entity_inst_t &pi) : sd(0),
- done(false), peer_inst(pi), server(false),
- sent_close(false),
- reader_running(false), writer_running(false),
- reader_thread(this), writer_thread(this) {
+ Pipe(const entity_addr_t &pi) : sd(0),
+ done(false), peer_addr(pi), server(false),
+ need_to_send_close(true),
+ reader_running(false), writer_running(false),
+ reader_thread(this), writer_thread(this) {
// client
writer_running = true;
writer_thread.create();
// public constructors
static const Pipe& Server(int s);
- static const Pipe& Client(const entity_inst_t& pi);
+ static const Pipe& Client(const entity_addr_t& pi);
- entity_inst_t& get_peer_inst() { return peer_inst; }
+ entity_addr_t& get_peer_addr() { return peer_addr; }
void close();
void join() {
}
public:
- EntityMessenger(msg_addr_t myaddr);
+ EntityMessenger(entity_name_t myaddr);
~EntityMessenger();
void ready();
dispatch_thread.join();
}
- void reset_myaddr(msg_addr_t m);
+ const entity_addr_t &get_myaddr();
+
+ void reset_myname(entity_name_t m);
- void callback_kick() {}
int shutdown();
- void prepare_dest(const entity_inst_t& inst);
- int send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+ void prepare_dest(const entity_addr_t& addr);
+ int send_message(Message *m, entity_inst_t dest,
int port=0, int fromport=0);
- void mark_down(msg_addr_t a, entity_inst_t& i);
- void mark_up(msg_addr_t a, entity_inst_t& i);
+ void mark_down(entity_addr_t a);
+ void mark_up(entity_name_t a, entity_addr_t& i);
};
bool single_dispatch_stop;
list<Message*> single_dispatch_queue;
- map<msg_addr_t, list<Message*> > waiting_for_ready;
+ map<entity_name_t, list<Message*> > waiting_for_ready;
void single_dispatcher_entry();
void _submit_single_dispatch(Message *m);
// where i listen
tcpaddr_t listen_addr;
-
- // my instance
- entity_inst_t my_inst;
+ entity_addr_t my_addr;
- // lookup
- hash_map<msg_addr_t, entity_inst_t> entity_map;
- hash_set<msg_addr_t> entity_unstarted;
-
// local
- map<msg_addr_t, EntityMessenger*> local;
+ map<entity_name_t, EntityMessenger*> local;
+ set<entity_name_t> stopped;
+ //hash_set<entity_name_t> entity_unstarted;
// remote
- hash_map<__int64_t, Pipe*> rank_pipe;
+ hash_map<entity_addr_t, Pipe*> rank_pipe;
set<Pipe*> pipes;
list<Pipe*> pipe_reap_queue;
-
- void show_dir();
-
- Pipe *connect_rank(const entity_inst_t& inst);
+
+ Pipe *connect_rank(const entity_addr_t& addr);
- void mark_down(msg_addr_t addr, entity_inst_t& i);
- void mark_up(msg_addr_t addr, entity_inst_t& i);
+ void mark_down(entity_addr_t addr);
+ //void mark_up(entity_name_t addr, entity_addr_t& i);
tcpaddr_t get_listen_addr() { return listen_addr; }
void reaper();
- EntityMessenger *find_unnamed(msg_addr_t a);
+ EntityMessenger *find_unnamed(entity_name_t a);
public:
Rank();
~Rank();
- void set_listen_addr(tcpaddr_t& a);
+ //void set_listen_addr(tcpaddr_t& a);
int start_rank();
void wait();
- EntityMessenger *register_entity(msg_addr_t addr);
- void rename_entity(EntityMessenger *ms, msg_addr_t newaddr);
+ EntityMessenger *register_entity(entity_name_t addr);
+ void rename_entity(EntityMessenger *ms, entity_name_t newaddr);
void unregister_entity(EntityMessenger *ms);
- void submit_message(Message *m, const entity_inst_t& inst);
- void prepare_dest(const entity_inst_t& inst);
+ void submit_message(Message *m, const entity_addr_t& addr);
+ void prepare_dest(const entity_addr_t& addr);
// create a new messenger
- EntityMessenger *new_entity(msg_addr_t addr);
+ EntityMessenger *new_entity(entity_name_t addr);
} ;
// pick id
int rank = m->get_rank();
- msg_addr_t entity = m->get_entity();
+ entity_name_t entity = m->get_entity();
if (entity.is_new()) {
// make up a new address!
void TCPDirectory::handle_started(Message *m)
{
- msg_addr_t entity = m->get_source();
+ entity_name_t entity = m->get_source();
dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl;
hold.erase(entity);
void TCPDirectory::handle_unregister(Message *m)
{
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl;
assert(dir.count(who));
else {
if (0) {
dout(10) << "dir size now " << dir.size() << endl;
- for (hash_map<msg_addr_t, int>::iterator it = dir.begin();
+ for (hash_map<entity_name_t, int>::iterator it = dir.begin();
it != dir.end();
it++) {
dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
TCPMessenger *messenger;
// directory
- hash_map<msg_addr_t, int> dir; // entity -> rank
+ hash_map<entity_name_t, int> dir; // entity -> rank
hash_map<int, tcpaddr_t> rank_addr; // rank -> ADDR (e.g. host:port)
__uint64_t version;
- map<__uint64_t, msg_addr_t> update_log;
+ map<__uint64_t, entity_name_t> update_log;
int nrank;
int nclient, nmds, nosd;
- set<msg_addr_t> hold;
- map<msg_addr_t, list<Message*> > waiting;
+ set<entity_name_t> hold;
+ map<entity_name_t, list<Message*> > waiting;
// messages
void handle_connect(class MNSConnect*);
// local directory
-hash_map<msg_addr_t, TCPMessenger*> directory; // local
-hash_set<msg_addr_t> directory_ready;
+hash_map<entity_name_t, TCPMessenger*> directory; // local
+hash_set<entity_name_t> directory_ready;
Mutex directory_lock;
// connecting
// register
long regid = 0;
map<int, Cond* > waiting_for_register_cond;
-map<int, msg_addr_t > waiting_for_register_result;
+map<int, entity_name_t > waiting_for_register_result;
// incoming messages
list<Message*> incoming;
} single_out_thread;
Mutex lookup_lock; //
-hash_map<msg_addr_t, int> entity_rank; // entity -> rank
+hash_map<entity_name_t, int> entity_rank; // entity -> rank
hash_map<int, int> rank_sd; // outgoing sockets, rank -> sd
hash_map<int, OutThread*> rank_out;
hash_map<int, tcpaddr_t> rank_addr; // rank -> tcpaddr
-map<msg_addr_t, list<Message*> > waiting_for_lookup;
+map<entity_name_t, list<Message*> > waiting_for_lookup;
/* this process */
list<Message*> waiting;
dout(DBL) << "got lookup reply" << endl;
- for (map<msg_addr_t, int>::iterator it = m->entity_rank.begin();
+ for (map<entity_name_t, int>::iterator it = m->entity_rank.begin();
it != m->entity_rank.end();
it++) {
dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
OutThread *tcp_lookup(Message *m)
{
- msg_addr_t addr = m->get_dest();
+ entity_name_t addr = m->get_dest();
if (!entity_rank.count(m->get_dest())) {
// lookup and wait.
while (!tcp_done) {
Message *m = tcp_recv(sd);
if (!m) break;
- msg_addr_t who = m->get_source();
+ entity_name_t who = m->get_source();
dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl;
size_t sz = m->get_payload().length();
if (g_conf.tcp_multi_dispatch) {
- const msg_addr_t dest = m->get_dest();
+ const entity_name_t dest = m->get_dest();
directory_lock.Lock();
TCPMessenger *messenger = directory[ dest ];
directory_lock.Unlock();
}
// ok
- msg_addr_t dest = m->get_dest();
+ entity_name_t dest = m->get_dest();
directory_lock.Lock();
if (directory.count(dest)) {
Messenger *who = directory[ dest ];
-msg_addr_t register_entity(msg_addr_t addr)
+entity_name_t register_entity(entity_name_t addr)
{
lookup_lock.Lock();
cond.Wait(lookup_lock);
// get result, clean up
- msg_addr_t entity = waiting_for_register_result[id];
+ entity_name_t entity = waiting_for_register_result[id];
waiting_for_register_result.erase(id);
waiting_for_register_cond.erase(id);
*/
-TCPMessenger::TCPMessenger(msg_addr_t myaddr) :
+TCPMessenger::TCPMessenger(entity_name_t myaddr) :
Messenger(myaddr),
dispatch_thread(this)
{
return listen_addr;
}
-void TCPMessenger::map_entity_rank(msg_addr_t e, int r)
+void TCPMessenger::map_entity_rank(entity_name_t e, int r)
{
lookup_lock.Lock();
entity_rank[e] = r;
if (g_conf.tcp_multi_dispatch) {
// kill off dispatch threads
dout(DBL) << "killing dispatch threads" << endl;
- for (hash_map<msg_addr_t,TCPMessenger*>::iterator it = directory.begin();
+ for (hash_map<entity_name_t,TCPMessenger*>::iterator it = directory.begin();
it != directory.end();
it++)
it->second->dispatch_stop();
/* note: send_message _MUST_ be non-blocking */
-int TCPMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+int TCPMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport)
{
// set envelope
m->set_source(get_myaddr(), fromport);
}
public:
- TCPMessenger(msg_addr_t myaddr);
+ TCPMessenger(entity_name_t myaddr);
~TCPMessenger();
void ready();
tcpaddr_t& get_tcpaddr();
- void map_entity_rank(msg_addr_t e, int r);
+ void map_entity_rank(entity_name_t e, int r);
void map_rank_addr(int r, tcpaddr_t a);
int get_dispatch_queue_len();
virtual int shutdown();
// message interface
- virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0);
};
/**
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MSG_TYPES_H
+#define __MSG_TYPES_H
+
+#include "include/types.h"
+#include "tcp.h"
+
+// new typed msg_addr_t way!
+class entity_name_t {
+ int _type;
+ int _num;
+
+public:
+ static const int TYPE_MON = 1;
+ static const int TYPE_MDS = 2;
+ static const int TYPE_OSD = 3;
+ static const int TYPE_CLIENT = 4;
+
+ static const int NEW = -1;
+
+ // cons
+ entity_name_t() : _type(0), _num(0) {}
+ entity_name_t(int t, int n) : _type(t), _num(n) {}
+
+ int num() const { return _num; }
+ int type() const { return _type; }
+ const char *type_str() const {
+ switch (type()) {
+ case TYPE_MDS: return "mds";
+ case TYPE_OSD: return "osd";
+ case TYPE_MON: return "mon";
+ case TYPE_CLIENT: return "client";
+ default: return "unknown";
+ }
+ }
+
+ bool is_new() const { return num() == NEW; }
+
+ bool is_client() const { return type() == TYPE_CLIENT; }
+ bool is_mds() const { return type() == TYPE_MDS; }
+ bool is_osd() const { return type() == TYPE_OSD; }
+ bool is_mon() const { return type() == TYPE_MON; }
+};
+
+inline bool operator== (const entity_name_t& l, const entity_name_t& r) {
+ return (l.type() == r.type()) && (l.num() == r.num()); }
+inline bool operator!= (const entity_name_t& l, const entity_name_t& r) {
+ return (l.type() != r.type()) || (l.num() != r.num()); }
+inline bool operator< (const entity_name_t& l, const entity_name_t& r) {
+ return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); }
+
+inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) {
+ //if (addr.is_namer()) return out << "namer";
+ if (addr.is_new() || addr.num() < 0)
+ return out << addr.type_str() << "?";
+ else
+ return out << addr.type_str() << addr.num();
+}
+
+namespace __gnu_cxx {
+ template<> struct hash< entity_name_t >
+ {
+ size_t operator()( const entity_name_t m ) const
+ {
+ static blobhash H;
+ return H((const char*)&m, sizeof(m));
+ }
+ };
+}
+
+// get rid of these
+#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x)
+#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x)
+#define MSG_ADDR_MON(x) entity_name_t(entity_name_t::TYPE_MON,x)
+#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x)
+
+#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW)
+#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW)
+#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW)
+#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW)
+
+
+/*
+ * an entity's network address.
+ * includes a random value that prevents it from being reused.
+ * thus identifies a particular process instance.
+ * ipv4 for now.
+ */
+struct entity_addr_t {
+ __uint8_t ipq[4];    // ipv4 address, one octet per entry
+ __uint32_t port;
+ __uint32_t nonce; // bind time, or pid, or something unique!
+
+ entity_addr_t() : port(0), nonce(0) {
+ ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0;
+ }
+
+ void set_addr(tcpaddr_t a) {
+ memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4);
+ port = a.sin_port;
+ }
+ void make_addr(tcpaddr_t& a) const {
+ a.sin_family = AF_INET;
+ memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4);
+ a.sin_port = port;
+ }
+};
+
+inline ostream& operator<<(ostream& out, const entity_addr_t &addr)
+{
+ return out << (int)addr.ipq[0]
+ << '.' << (int)addr.ipq[1]
+ << '.' << (int)addr.ipq[2]
+ << '.' << (int)addr.ipq[3]
+ << ':' << addr.port
+ << '.' << addr.nonce;
+}
+
+inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; }
+inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; }
+inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; }
+inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; }
+inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; }
+inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; }
+
+namespace __gnu_cxx {
+ template<> struct hash< entity_addr_t >
+ {
+ size_t operator()( const entity_addr_t& x ) const
+ {
+ static blobhash H;
+ return H((const char*)&x, sizeof(x));
+ }
+ };
+}
+
+
+/*
+ * a particular entity instance
+ */
+struct entity_inst_t {
+ entity_name_t name;
+ entity_addr_t addr;
+ entity_inst_t() {}
+ entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {}
+};
+
+
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; }
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; }
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; }
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; }
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; }
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; }
+
+namespace __gnu_cxx {
+ template<> struct hash< entity_inst_t >
+ {
+ size_t operator()( const entity_inst_t& x ) const
+ {
+ static blobhash H;
+ return H((const char*)&x, sizeof(x));
+ }
+ };
+}
+
+inline ostream& operator<<(ostream& out, const entity_inst_t &i)
+{
+ return out << i.name << " " << i.addr;
+}
+
+
+#endif
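
Roughly how the three new types compose (a hypothetical usage snippet, assuming this header lives at msg/msg_types.h and that tcpaddr_t from tcp.h is the usual sockaddr_in): entity_name_t names a logical entity, entity_addr_t pins down the process serving it (ip:port plus a nonce), and entity_inst_t pairs the two.

#include <netinet/in.h>
#include <iostream>
#include "msg/msg_types.h"

int main() {
  // a logical name: osd3
  entity_name_t name = MSG_ADDR_OSD(3);

  // the process behind it: ip:port taken from a tcpaddr_t, plus a uniquifying nonce
  tcpaddr_t ta;
  ta.sin_family = AF_INET;
  ta.sin_addr.s_addr = htonl(0x0a000001);   // 10.0.0.1
  ta.sin_port = htons(6800);

  entity_addr_t addr;
  addr.set_addr(ta);
  addr.nonce = 12345;                       // e.g. getpid(), as in accepter.start

  // a particular instance = who + where
  entity_inst_t inst(name, addr);
  std::cout << inst << std::endl;           // prints something like "osd3 10.0.0.1:<port>.12345"
  return 0;
}
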
rank.start_rank(); // bind and listen
if (mpi_rank < g_conf.num_mon) {
- moninst[mpi_rank].set_addr( rank.get_listen_addr() );
+ moninst[mpi_rank].addr = rank.my_addr;
+ moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank);
//cerr << mpi_rank << " at " << rank.get_listen_addr() << endl;
}
for (int i=0; i<NUMMDS; i++) {
if (myrank != g_conf.ms_skip_rank0+i) continue;
Messenger *m = rank.register_entity(MSG_ADDR_MDS(i));
- cerr << "mds" << i << " at " << rank.my_inst << " " << hostname << "." << pid << endl;
+ cerr << "mds" << i << " at " << rank.my_addr << " " << hostname << "." << pid << endl;
mds[i] = new MDS(i, m, monmap);
mds[i]->init();
started++;
g_timer.add_event_after(kill_osd_after[i], new C_Die);
Messenger *m = rank.register_entity(MSG_ADDR_OSD(i));
- cerr << "osd" << i << " at " << rank.my_inst << " " << hostname << "." << pid << endl;
+ cerr << "osd" << i << " at " << rank.my_addr << " " << hostname << "." << pid << endl;
osd[i] = new OSD(i, m, monmap);
osd[i]->init();
started++;
nclients++;
}
if (nclients) {
- cerr << nclients << " clients at " << rank.my_inst << " " << hostname << "." << pid << endl;
+ cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl;
}
for (set<int>::iterator it = clientlist.begin();
if (myrank && !started) {
//dout(1) << "IDLE" << endl;
- cerr << "idle at " << rank.my_inst << " " << hostname << "." << pid << endl;
+ cerr << "idle at " << rank.my_addr << " " << hostname << "." << pid << endl;
//rank.stop_rank();
}
LogType osd_logtype;
-OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev)
+OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock)
{
whoami = id;
messenger = m;
waiting_for_no_ops = false;
if (g_conf.osd_remount_at)
- g_timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this));
+ timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this));
-
// init object store
// try in this order:
// announce to monitor i exist and have booted.
int mon = monmap->pick_mon();
- messenger->send_message(new MOSDBoot(superblock), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(new MOSDBoot(superblock), monmap->get_inst(mon));
// start the heart
- next_heartbeat = new C_Heartbeat(this);
- g_timer.add_event_after(g_conf.osd_heartbeat_interval, next_heartbeat);
+ timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this));
}
osd_lock.Unlock();
int OSD::shutdown()
{
- dout(1) << "shutdown, timer has " << g_timer.num_event << endl;
-
- if (next_heartbeat) g_timer.cancel_event(next_heartbeat);
+ dout(1) << "shutdown" << endl;
state = STATE_STOPPING;
+ // cancel timers
+ timer.cancel_all();
+ timer.join();
+
// finish ops
wait_for_no_ops();
void OSD::heartbeat()
{
- osd_lock.Lock();
-
utime_t now = g_clock.now();
utime_t since = now;
since.sec_ref() -= g_conf.osd_heartbeat_interval;
for (set<int>::iterator i = pingset.begin();
i != pingset.end();
i++) {
- _share_map_outgoing( MSG_ADDR_OSD(*i), osdmap->get_inst(*i) );
+ _share_map_outgoing( osdmap->get_inst(*i) );
messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen),
- MSG_ADDR_OSD(*i), osdmap->get_inst(*i));
+ osdmap->get_inst(*i));
}
if (logger) logger->set("pingset", pingset.size());
if ((rand() % g_conf.fake_osdmap_updates) == 0) {
//if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) {
messenger->send_message(new MOSDIn(osdmap->get_epoch()),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ monmap->get_inst(mon));
}
/*
if (osdmap->is_out(whoami)) {
}
// schedule next! randomly.
- next_heartbeat = new C_Heartbeat(this);
float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval;
- g_timer.add_event_after(wait, next_heartbeat);
-
- osd_lock.Unlock();
+ timer.add_event_after(wait, new C_Heartbeat(this));
}
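
The heartbeat now rides on the per-OSD SafeTimer (constructed over osd_lock) rather than the global g_timer: callbacks fire with osd_lock already held, each heartbeat() schedules the next C_Heartbeat at a randomized interval, and shutdown() ends the cycle with cancel_all()/join(). A self-contained, threadless sketch of that self-rescheduling shape, using toy Context/timer classes rather than the real SafeTimer API:

#include <cstdlib>
#include <functional>
#include <iostream>
#include <queue>
#include <utility>
#include <vector>

// toy stand-ins for Context and SafeTimer; only the shape matters:
// the callback reschedules itself at a randomized interval.
struct Context {
  virtual ~Context() {}
  virtual void finish(int r) = 0;
};

class ToyTimer {
  typedef std::pair<double, Context*> Event;
  std::priority_queue<Event, std::vector<Event>, std::greater<Event> > events;
  double now;
public:
  ToyTimer() : now(0) {}
  void add_event_after(double t, Context *c) { events.push(Event(now + t, c)); }
  bool run_next() {                        // fire the earliest pending event
    if (events.empty()) return false;
    Event e = events.top();
    events.pop();
    now = e.first;
    e.second->finish(0);
    delete e.second;
    return true;
  }
  double time() const { return now; }
};

struct ToyOSD;
struct C_Heartbeat : public Context {
  ToyOSD *osd;
  explicit C_Heartbeat(ToyOSD *o) : osd(o) {}
  void finish(int r);
};

struct ToyOSD {
  ToyTimer timer;
  double heartbeat_interval;
  ToyOSD() : heartbeat_interval(1.0) {
    // start the heart (as in OSD::init)
    timer.add_event_after(heartbeat_interval, new C_Heartbeat(this));
  }
  void heartbeat() {
    std::cout << "heartbeat at t=" << timer.time() << std::endl;
    // schedule next! randomly.
    double wait = .5 + ((double)(rand() % 10) / 10.0) * heartbeat_interval;
    timer.add_event_after(wait, new C_Heartbeat(this));
  }
};

void C_Heartbeat::finish(int) { osd->heartbeat(); }

int main() {
  ToyOSD osd;
  for (int i = 0; i < 5 && osd.timer.run_next(); ++i) {}
  return 0;
}
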
// --------------------------------------
// dispatch
-bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch)
+bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch)
{
bool shared = false;
// does client have old map?
- if (who.is_client()) {
+ if (inst.name.is_client()) {
if (epoch < osdmap->get_epoch()) {
- dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
- send_incremental_map(epoch, who, inst, true);
+ dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+ send_incremental_map(epoch, inst, true);
shared = true;
}
}
// does peer have old map?
- if (who.is_osd()) {
+ if (inst.name.is_osd()) {
// remember
- if (peer_map_epoch[who] < epoch)
- peer_map_epoch[who] = epoch;
+ if (peer_map_epoch[inst.name] < epoch)
+ peer_map_epoch[inst.name] = epoch;
// older?
- if (peer_map_epoch[who] < osdmap->get_epoch()) {
- dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
- send_incremental_map(epoch, who, inst, true);
- peer_map_epoch[who] = osdmap->get_epoch(); // so we don't send it again.
+ if (peer_map_epoch[inst.name] < osdmap->get_epoch()) {
+ dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+ send_incremental_map(epoch, inst, true);
+ peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again.
shared = true;
}
}
}
-void OSD::_share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst)
+void OSD::_share_map_outgoing(const entity_inst_t& inst)
{
- assert(dest.is_osd());
+ assert(inst.name.is_osd());
- if (dest.is_osd()) {
+ if (inst.name.is_osd()) {
// send map?
- if (peer_map_epoch.count(dest)) {
- epoch_t pe = peer_map_epoch[dest];
+ if (peer_map_epoch.count(inst.name)) {
+ epoch_t pe = peer_map_epoch[inst.name];
if (pe < osdmap->get_epoch()) {
- send_incremental_map(pe, dest, inst, true);
- peer_map_epoch[dest] = osdmap->get_epoch();
+ send_incremental_map(pe, inst, true);
+ peer_map_epoch[inst.name] = osdmap->get_epoch();
}
} else {
// no idea about peer's epoch.
}
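
Both _share_map_* helpers hang off the same bookkeeping: peer_map_epoch remembers the newest osdmap epoch each OSD peer has been seen with, and whenever a peer (or, on the incoming side, a client) is behind the local epoch we push an incremental map and bump the record so it isn't sent twice. A toy version of the OSD-peer branch, with strings standing in for entity names and the send reduced to a print:

#include <iostream>
#include <map>
#include <string>

typedef unsigned epoch_t;
typedef std::string Name;                  // stand-in for entity_name_t

struct MapSharer {
  epoch_t my_epoch;                        // osdmap->get_epoch()
  std::map<Name, epoch_t> peer_map_epoch;  // newest epoch seen per peer

  explicit MapSharer(epoch_t e) : my_epoch(e) {}

  void send_incremental_map(epoch_t since, const Name &peer) {
    std::cout << "send incremental map " << since << " -> " << my_epoch
              << " to " << peer << std::endl;
  }

  // a message arrived from `peer`, stamped with the epoch it was built against
  bool share_map_incoming(const Name &peer, epoch_t epoch) {
    if (peer_map_epoch[peer] < epoch)
      peer_map_epoch[peer] = epoch;        // remember what they already have
    if (peer_map_epoch[peer] < my_epoch) {
      send_incremental_map(peer_map_epoch[peer], peer);
      peer_map_epoch[peer] = my_epoch;     // so we don't send it again
      return true;                         // shared
    }
    return false;
  }
};

int main() {
  MapSharer s(12);
  s.share_map_incoming("osd1", 9);         // behind: triggers a send
  s.share_map_incoming("osd1", 12);        // caught up: nothing to send
  return 0;
}
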
-void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst)
{
+ entity_name_t dest = inst.name;
+
if (g_conf.ms_die_on_failure) {
exit(0);
}
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", dropping and reporting to mon" << mon
<< endl;
- messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
+ monmap->get_inst(mon));
delete m;
} else if (dest.is_mon()) {
// resend to a different monitor.
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", resending to mon" << mon
<< endl;
- messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(m, monmap->get_inst(mon));
}
else {
// client?
}
}
-bool OSD::ms_lookup(msg_addr_t dest, entity_inst_t& inst)
-{
- if (dest.is_osd()) {
- assert(osdmap);
- return osdmap->get_inst(dest.num(), inst);
- }
-
- assert(0);
- return false;
-}
-
void OSD::handle_osd_ping(MOSDPing *m)
{
dout(20) << "osdping from " << m->get_source() << endl;
- _share_map_incoming(m->get_source(), m->get_source_inst(), ((MOSDPing*)m)->map_epoch);
+ _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch);
int from = m->get_source().num();
peer_qlen[from] = m->avg_qlen;
if (waiting_for_osdmap.empty()) {
int mon = monmap->pick_mon();
messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ monmap->get_inst(mon));
}
waiting_for_osdmap.push_back(m);
i++) {
int osd = i->first;
if (osd == whoami) continue;
- messenger->mark_down(MSG_ADDR_OSD(osd), i->second);
+ messenger->mark_down(i->second.addr);
peer_map_epoch.erase(MSG_ADDR_OSD(osd));
// kick any replica ops
i != inc.new_up.end();
i++) {
if (i->first == whoami) continue;
- messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
peer_map_epoch.erase(MSG_ADDR_OSD(i->first));
}
}
else {
dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
int mon = monmap->pick_mon();
- messenger->send_message(new MOSDGetMap(cur), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon));
break;
}
}
-void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full)
+void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full)
{
dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
- << " to " << dest << endl;
+ << " to " << inst << endl;
MOSDMap *m = new MOSDMap;
}
}
- messenger->send_message(m, dest, inst);
+ messenger->send_message(m, inst);
}
bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
}
dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl;
MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second);
- _share_map_outgoing(MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first));
- messenger->send_message(m, MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first));
+ _share_map_outgoing(osdmap->get_inst(it->first));
+ messenger->send_message(m, osdmap->get_inst(it->first));
}
}
MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(),
pit->second);
- _share_map_outgoing(MSG_ADDR_OSD(who), osdmap->get_inst(who));
- messenger->send_message(m, MSG_ADDR_OSD(who), osdmap->get_inst(who));
+ _share_map_outgoing(osdmap->get_inst(who));
+ messenger->send_message(m, osdmap->get_inst(who));
}
}
dout(10) << *pg << " sending " << m->log << " " << m->missing << endl;
//m->log.print(cout);
- _share_map_outgoing(MSG_ADDR_OSD(from), osdmap->get_inst(from));
- messenger->send_message(m, MSG_ADDR_OSD(from), osdmap->get_inst(from));
+ _share_map_outgoing(osdmap->get_inst(from));
+ messenger->send_message(m, osdmap->get_inst(from));
}
_unlock_pg(pgid);
// send op
tid_t tid = ++last_tid;
- MOSDOp *op = new MOSDOp(tid, messenger->get_myaddr(),
+ MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid,
oid, pg->get_pgid(),
osdmap->get_epoch(),
OSD_OP_PULL);
op->set_version(v);
- messenger->send_message(op, MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+ messenger->send_message(op, osdmap->get_inst(osd));
// take note
assert(pg->objects_pulling.count(oid) == 0);
logger->inc("r_pushb", bl.length());
// send
- MOSDOp *op = new MOSDOp(++last_tid, MSG_ADDR_OSD(whoami),
+ MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid,
oid, pg->info.pgid, osdmap->get_epoch(),
OSD_OP_PUSH);
op->set_offset(0);
op->set_version(v);
op->set_attrset(attrset);
- messenger->send_message(op, MSG_ADDR_OSD(dest), osdmap->get_inst(dest));
+ messenger->send_message(op, osdmap->get_inst(dest));
}
<< endl;
MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
commit->set_pg_complete_thru(last_complete);
- messenger->send_message(commit, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+ messenger->send_message(commit, osdmap->get_inst(ackerosd));
delete op;
}
// send ack to acker?
if (g_conf.osd_rep != OSD_REP_CHAIN) {
MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false);
- messenger->send_message(ack, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+ messenger->send_message(ack, osdmap->get_inst(ackerosd));
}
// ack myself.
if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
// share our map with sender, if they're old
- _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+ _share_map_incoming(op->get_source_inst(), op->get_map_epoch());
// what kind of op?
bool read = op->get_op() < 10; // read, stat. but not pull.
if (pg->acting.size() > 1) {
int peer = pg->acting[1];
dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl;
- messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+ messenger->send_message(op, osdmap->get_inst(peer));
return;
}
}
<< ", fwd to peer w/ qlen " << peer_qlen[peer]
<< " osd" << peer
<< endl;
- messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+ messenger->send_message(op, osdmap->get_inst(peer));
return;
}
}
if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
// share our map with sender, if they're old
- _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+ _share_map_incoming(op->get_source_inst(), op->get_map_epoch());
if (!pg) {
// hmm.
{
object_t oid = op->get_oid();
- msg_addr_t source;
- int len = store->getattr(oid, "wrlock", &source, sizeof(msg_addr_t));
+ entity_name_t source;
+ int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t));
//cout << "getattr returns " << len << " on " << oid << endl;
if (len == sizeof(source) &&
if (r >= 0) logger->inc("rdb", r);
// send it
- messenger->send_message(reply, op->get_client(), op->get_client_inst());
+ messenger->send_message(reply, op->get_client_inst());
delete op;
}
MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true);
reply->set_object_size(st.st_size);
- messenger->send_message(reply, op->get_client(), op->get_client_inst());
+ messenger->send_message(reply, op->get_client_inst());
logger->inc("stat");
// send commit.
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true);
dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl;
- messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+ messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_commit = true;
}
// send ack
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false);
dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl;
- messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+ messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_ack = true;
utime_t now = g_clock.now();
<< endl;
// forward the write/update/whatever
- MOSDOp *wr = new MOSDOp(op->get_tid(),
- op->get_client(),
+ MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid,
oid,
pg->get_pgid(),
osdmap->get_epoch(),
wr->set_rep_tid(op->get_rep_tid());
wr->set_pg_trim_to(pg->peers_complete_thru);
- messenger->send_message(wr, MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+ messenger->send_message(wr, osdmap->get_inst(osd));
}
PG::RepOpGather *OSD::new_repop_gather(PG *pg,
}
// dup op?
- reqid_t reqid(op->get_client(), op->get_tid());
- if (pg->log.logged_req(reqid)) {
- dout(-3) << "op_modify " << opname << " dup op " << reqid
+ if (pg->log.logged_req(op->get_reqid())) {
+ dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid()
<< ", doing WRNOOP" << endl;
op->set_op(OSD_OP_WRNOOP);
opname = MOSDOp::get_opname(op->get_op());
{
for (unsigned i=1; i<pg->acting.size(); i++) {
int osd = pg->acting[i];
- _share_map_outgoing( MSG_ADDR_OSD(osd), osdmap->get_inst(osd) );
+ _share_map_outgoing( osdmap->get_inst(osd) );
}
}
osd_lock.Unlock();
if (crev && rev && rev > crev) {
eversion_t cv = version;
cv.version--;
- PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv,
- op->get_client(), op->get_tid());
+ PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid());
pg->log.add(cloneentry);
dout(10) << "prepare_log_transaction " << op->get_op()
// actual op
int opcode = PG::Log::Entry::MODIFY;
if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE;
- PG::Log::Entry logentry(opcode, oid, version,
- op->get_client(), op->get_tid());
+ PG::Log::Entry logentry(opcode, oid, version, op->get_reqid());
dout(10) << "prepare_log_transaction " << op->get_op()
<< " " << logentry
case OSD_OP_WRLOCK:
{ // lock object
//r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit);
- t.setattr(oid, "wrlock", &op->get_client(), sizeof(msg_addr_t));
+ t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t));
}
break;
#include "common/Mutex.h"
#include "common/ThreadPool.h"
+#include "common/Timer.h"
#include "mon/MonMap.h"
void finish(int r) {
osd->heartbeat();
}
- } *next_heartbeat;
+ };
// global lock
Mutex osd_lock;
+ SafeTimer timer;
// -- stats --
int hb_stat_ops; // ops since last heartbeat
class OSDMap *osdmap;
list<class Message*> waiting_for_osdmap;
- hash_map<msg_addr_t, epoch_t> peer_map_epoch;
- bool _share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch);
- void _share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst);
+ hash_map<entity_name_t, epoch_t> peer_map_epoch; // FIXME types
+ bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch);
+ void _share_map_outgoing(const entity_inst_t& inst);
void wait_for_new_map(Message *m);
void handle_osd_map(class MOSDMap *m);
bool get_inc_map_bl(epoch_t e, bufferlist& bl);
bool get_inc_map(epoch_t e, OSDMap::Incremental &inc);
- void send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full);
+ void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full);
// messages
virtual void dispatch(Message *m);
- virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
- virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst);
+ virtual void ms_handle_failure(Message *m, const entity_inst_t& inst);
void handle_osd_ping(class MOSDPing *m);
void handle_op(class MOSDOp *m);
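// note: OSD.h drops the stored next_heartbeat pointer and grows a per-osd
// SafeTimer (used elsewhere in this patch via osd->timer).  a hedged
// sketch of how the heartbeat might reschedule itself on that timer;
// C_Heartbeat and g_conf.osd_heartbeat_interval are assumed names, not
// taken from this patch:
//
//   void OSD::heartbeat()
//   {
//     // ... ping peers, reset hb_stat_ops, etc. (elided) ...
//     timer.add_event_after(g_conf.osd_heartbeat_interval,
//                           new C_Heartbeat(this));   // fire again later
//   }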
*/
#include "config.h"
#include "include/types.h"
+#include "osd_types.h"
#include "msg/Message.h"
#include "common/Mutex.h"
#include "common/Clock.h"
#define __OBJECTSTORE_H
#include "include/types.h"
+#include "osd_types.h"
#include "include/Context.h"
#include "include/buffer.h"
if (is_crashed()) {
dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl;
state_set(STATE_REPLAY);
- g_timer.add_event_after(g_conf.osd_replay_window,
- new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
+ osd->timer.add_event_after(g_conf.osd_replay_window,
+ new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
}
else if (!is_active()) {
// -- ok, activate!
dout(10) << "activate sending " << m->log << " " << m->missing
<< " to osd" << peer << endl;
//m->log.print(cout);
- osd->messenger->send_message(m, MSG_ADDR_OSD(peer), osd->osdmap->get_inst(peer));
+ osd->messenger->send_message(m, osd->osdmap->get_inst(peer));
// update our missing
if (peer_missing[peer].num_missing() == 0) {
ls.push_back(info);
osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(),
ls),
- MSG_ADDR_OSD(get_primary()), osd->osdmap->get_inst(get_primary()));
+ osd->osdmap->get_inst(get_primary()));
}
return false;
set<pg_t> ls;
ls.insert(info.pgid);
MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls);
- osd->messenger->send_message(m, MSG_ADDR_OSD(*p), osd->osdmap->get_inst(*p));
+ osd->messenger->send_message(m, osd->osdmap->get_inst(*p));
}
stray_set.clear();
class OSD;
-/* reqid_t - caller + tid to unique identify this request
- */
-class reqid_t {
-public:
- msg_addr_t addr;
- tid_t tid;
- reqid_t() : tid(0) {}
- reqid_t(const msg_addr_t& a, tid_t t) : addr(a), tid(t) {}
-};
-
-inline ostream& operator<<(ostream& out, const reqid_t& r) {
- return out << r.addr << "." << r.tid;
-}
-inline bool operator==(const reqid_t& l, const reqid_t& r) {
- return (l.addr == r.addr) && (l.tid == r.tid);
-}
-inline bool operator!=(const reqid_t& l, const reqid_t& r) {
- return (l.addr != r.addr) || (l.tid != r.tid);
-}
-namespace __gnu_cxx {
- template<> struct hash<reqid_t> {
- size_t operator()(const reqid_t &r) const {
- static hash<unsigned long> H;
- static hash<__uint64_t> I;
- return H(r.addr.type() ^ r.addr.num()) ^ I(r.tid);
- }
- };
-}
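// note: reqid_t moves out of PG.h; the new osd_types.h later in this patch
// pulls it in from include/reqid.h instead.  that header is not part of
// this excerpt -- a hedged guess at its shape, mirroring the removed class
// with entity_name_t in place of the old msg_addr_t (member names are
// assumptions):
//
//   class reqid_t {
//   public:
//     entity_name_t name;   // requesting entity
//     tid_t         tid;    // its transaction id
//     reqid_t() : tid(0) {}
//     reqid_t(const entity_name_t& n, tid_t t) : name(n), tid(t) {}
//   };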
/** PG - Replica Placement Group
*
eversion_t version;
objectrev_t rev;
- reqid_t reqid; // caller+tid to uniquely identify request
+ reqid_t reqid; // caller+tid to uniquely identify request
Entry() : op(0) {}
Entry(int _op, object_t _oid, const eversion_t& v,
- const msg_addr_t& a, tid_t t) :
- op(_op), oid(_oid), version(v), reqid(a,t) {}
+ const reqid_t& rid) :
+ op(_op), oid(_oid), version(v), reqid(rid) {}
bool is_delete() const { return op == DELETE; }
bool is_clone() const { return op == CLONE; }
class IndexedLog : public Log {
public:
hash_map<object_t,Entry*> objects; // ptrs into log. be careful!
- hash_set<reqid_t> caller_ops;
+ hash_set<reqid_t> caller_ops;
// recovery pointers
list<Entry>::iterator requested_to; // not inclusive of referenced item
bool logged_object(object_t oid) {
return objects.count(oid);
}
- bool logged_req(reqid_t &r) {
+ bool logged_req(const reqid_t &r) {
return caller_ops.count(r);
}
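// note: taken with the Entry and logged_req changes, the dup-op hunk
// earlier reads as follows (condensed from this patch; op, pg, oid and
// version are in scope as in those hunks, this is not a complete function):
//
//   if (pg->log.logged_req(op->get_reqid())) {
//     op->set_op(OSD_OP_WRNOOP);              // replayed request: make it a no-op
//   }
//   // ...and the modify is logged under the same reqid for later replays:
//   PG::Log::Entry logentry(PG::Log::Entry::MODIFY, oid, version, op->get_reqid());
//   pg->log.add(logentry);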
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __OSD_TYPES_H
+#define __OSD_TYPES_H
+
+#include "include/reqid.h"
+
+// osd types
+typedef __uint64_t coll_t; // collection id
+
+
+// pg stuff
+typedef __uint16_t ps_t;
+typedef __uint8_t pruleset_t;
+
+// placement group id
+struct pg_t {
+ union {
+ struct {
+ int preferred;
+ ps_t ps;
+ __uint8_t nrep;
+ pruleset_t ruleset;
+ } fields;
+ __uint64_t val;
+ } u;
+ pg_t() { u.val = 0; }
+ pg_t(const pg_t& o) { u.val = o.u.val; }
+ pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) {
+ u.fields.ps = s;
+ u.fields.preferred = p;
+ u.fields.nrep = n;
+ u.fields.ruleset = r;
+ }
+ pg_t(__uint64_t v) { u.val = v; }
+ /*
+ pg_t operator=(__uint64_t v) { u.val = v; return *this; }
+ pg_t operator&=(__uint64_t v) { u.val &= v; return *this; }
+ pg_t operator+=(pg_t o) { u.val += o.val; return *this; }
+ pg_t operator-=(pg_t o) { u.val -= o.val; return *this; }
+ pg_t operator++() { ++u.val; return *this; }
+ */
+ operator __uint64_t() const { return u.val; }
+};
+
+inline ostream& operator<<(ostream& out, pg_t pg) {
+  //return out << hex << pg.u.val << dec;
+ if (pg.u.fields.ruleset)
+ out << (int)pg.u.fields.ruleset << '.';
+ out << (int)pg.u.fields.nrep << '.';
+ if (pg.u.fields.preferred)
+ out << pg.u.fields.preferred << '.';
+ out << hex << pg.u.fields.ps << dec;
+ return out;
+}
+
+namespace __gnu_cxx {
+ template<> struct hash< pg_t >
+ {
+ size_t operator()( const pg_t& x ) const
+ {
+ static hash<__uint64_t> H;
+ return H(x);
+ }
+ };
+}
+
+
+
+// compound rados version type
+class eversion_t {
+public:
+ epoch_t epoch;
+ version_t version;
+ eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {}
+};
+
+inline bool operator==(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) && (l.version == r.version);
+}
+inline bool operator!=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch != r.epoch) || (l.version != r.version);
+}
+inline bool operator<(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
+}
+inline bool operator<=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
+}
+inline bool operator>(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
+}
+inline bool operator>=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
+}
+inline ostream& operator<<(ostream& out, const eversion_t e) {
+ return out << e.epoch << "'" << e.version;
+}
+
+
+
+
+
+// -----------------------------------------
+
+class ObjectExtent {
+ public:
+ object_t oid; // object id
+ off_t start; // in object
+ size_t length; // in object
+
+ objectrev_t rev; // which revision?
+ pg_t pgid; // where to find the object
+
+  map<size_t, size_t>  buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented because of striping!)
+
+ ObjectExtent() : start(0), length(0), rev(0), pgid(0) {}
+ ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { }
+};
+
+inline ostream& operator<<(ostream& out, ObjectExtent &ex)
+{
+ return out << "extent("
+ << ex.oid << " in " << hex << ex.pgid << dec
+ << " " << ex.start << "~" << ex.length
+ << ")";
+}
+
+
+
+// ---------------------------------------
+
+class OSDSuperblock {
+public:
+ const static __uint64_t MAGIC = 0xeb0f505dULL;
+ __uint64_t magic;
+ __uint64_t fsid; // unique fs id (random number)
+ int whoami; // my role in this fs.
+ epoch_t current_epoch; // most recent epoch
+ epoch_t oldest_map, newest_map; // oldest/newest maps we have.
+ OSDSuperblock(__uint64_t f=0, int w=0) :
+ magic(MAGIC), fsid(f), whoami(w),
+ current_epoch(0), oldest_map(0), newest_map(0) {}
+};
+
+inline ostream& operator<<(ostream& out, OSDSuperblock& sb)
+{
+ return out << "sb(fsid " << sb.fsid
+ << " osd" << sb.whoami
+ << " e" << sb.current_epoch
+ << " [" << sb.oldest_map << "," << sb.newest_map
+ << "])";
+}
+
+
+#endif
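// usage sketch for the new types above; values are arbitrary, and the
// includes assume this sits in osd/ and is built with -I. on the source
// root (std names such as cout come in via include/types.h, as the header
// above already relies on).  illustrative only, not part of the patch.

#include <cassert>
#include "include/types.h"
#include "osd_types.h"

void osd_types_example(object_t some_oid)
{
  // pg_t packs (ruleset, nrep, preferred, ps) into one 64-bit value.
  pg_t pgid(/*ps=*/0x2a, /*preferred=*/0, /*nrep=*/2);   // ruleset defaults to 0
  __uint64_t raw = pgid;          // implicit conversion exposes the packed union
  assert(pg_t(raw) == pgid);      // and round-trips losslessly
  cout << pgid << endl;           // nrep '.' [preferred '.'] ps-in-hex, e.g. "2.2a"

  // eversion_t orders epoch-major, then version, and prints as e'v.
  eversion_t a(3, 10), b(3, 11), c(4, 1);
  assert(a < b && b < c && a != c);
  cout << c << endl;              // "4'1"

  // ObjectExtent: one object's slice of a (possibly striped) buffer.
  ObjectExtent ex(some_oid, /*start=*/0, /*length=*/32768);
  ex.pgid = pgid;
  ex.buffer_extents[0] = 32768;   // buffer bytes [0,32k) map into this object
  cout << ex << endl;

  // OSDSuperblock: per-osd identity plus the range of maps it stores.
  OSDSuperblock sb(/*fsid=*/12345, /*whoami=*/3);
  sb.current_epoch = sb.oldest_map = sb.newest_map = 1;
  cout << sb << endl;             // "sb(fsid 12345 osd3 e1 [1,1])"
}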
Header h;
assert(bl.length() == sizeof(h));
bl.copy(0, sizeof(h), (char*)&h);
- dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl;
write_pos = flush_pos = ack_pos = h.write_pos;
read_pos = requested_pos = received_pos = h.read_pos;
expire_pos = h.expire_pos;
trimmed_pos = trimming_pos = h.trimmed_pos;
+ dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl;
+
// probe the log
state = STATE_PROBING;
C_ProbeEnd *fin = new C_ProbeEnd(this);
void Journaler::_finish_probe_end(int r, off_t end)
{
- assert(r >= 0);
- assert(end >= write_pos);
assert(state == STATE_PROBING);
-
- dout(1) << "_finish_probe_end write_pos = " << end
- << " (header had " << write_pos << "). recovered."
- << endl;
+ if (end == -1) {
+ end = write_pos;
+ dout(1) << "_finish_probe_end write_pos = " << end
+ << " (header had " << write_pos << "). log was empty. recovered."
+ << endl;
+    assert(0); // hrm.  empty-log case isn't expected; bail loudly for now.
+ } else {
+ assert(end >= write_pos);
+ assert(r >= 0);
+ dout(1) << "_finish_probe_end write_pos = " << end
+ << " (header had " << write_pos << "). recovered."
+ << endl;
+ }
+
write_pos = flush_pos = ack_pos = end;
// done.
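// note: the probe completion now reports an explicit 'end' offset, with -1
// meaning the log was empty (per the dout above).  the C_ProbeEnd context
// wired up earlier is not shown in this excerpt; a guess at its shape,
// consistent with how it is used here (member names are assumptions):
//
//   class C_ProbeEnd : public Context {
//     Journaler *ls;
//   public:
//     off_t end;
//     C_ProbeEnd(Journaler *l) : ls(l), end(-1) {}
//     void finish(int r) { ls->_finish_probe_end(r, end); }
//   };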
for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
i != inc.new_down.end();
i++)
- messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
- for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
- i != inc.new_up.end();
- i++)
- messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
+ messenger->mark_down(i->second.addr);
}
else if (m->maps.count(e)) {
dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl;
int mon = monmap->pick_mon();
messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ monmap->get_inst(mon));
break;
}
// send
last_tid++;
- MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+ assert(client_inc >= 0);
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
ex.oid, ex.pgid, osdmap->get_epoch(),
OSD_OP_STAT);
dout(10) << "stat_submit " << st << " tid " << last_tid
<< endl;
if (pg.acker() >= 0)
- messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker()));
+ messenger->send_message(m, osdmap->get_inst(pg.acker()));
// add to gather set
st->tid = last_tid;
// send
last_tid++;
- MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+ assert(client_inc >= 0);
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
ex.oid, ex.pgid, osdmap->get_epoch(),
OSD_OP_READ);
m->set_length(ex.length);
<< endl;
if (pg.acker() >= 0)
- messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker()));
+ messenger->send_message(m, osdmap->get_inst(pg.acker()));
// add to gather set
rd->ops[last_tid] = ex;
else
tid = ++last_tid;
- MOSDOp *m = new MOSDOp(tid, messenger->get_myaddr(),
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
ex.oid, ex.pgid, osdmap->get_epoch(),
wr->op);
m->set_length(ex.length);
<< " osd" << pg.primary()
<< endl;
if (pg.primary() >= 0)
- messenger->send_message(m, MSG_ADDR_OSD(pg.primary()), osdmap->get_inst(pg.primary()));
+ messenger->send_message(m, osdmap->get_inst(pg.primary()));
dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl;
-void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst)
{
if (dest.is_mon()) {
// try a new mon
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", resending to mon" << mon
<< endl;
- messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(m, monmap->get_inst(mon));
}
else if (dest.is_osd()) {
int mon = monmap->pick_mon();
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", dropping and reporting to mon" << mon
<< endl;
- messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()),
- MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
+ monmap->get_inst(mon));
delete m;
} else {
dout(0) << "ms_handle_failure " << dest << " inst " << inst
private:
tid_t last_tid;
+ int client_inc;
int num_unacked;
int num_uncommitted;
public:
Objecter(Messenger *m, MonMap *mm, OSDMap *om) :
messenger(m), monmap(mm), osdmap(om),
- last_tid(0),
+ last_tid(0), client_inc(-1),
num_unacked(0), num_uncommitted(0)
{}
~Objecter() {
return !(op_read.empty() && op_modify.empty());
}
+ void set_client_incarnation(int inc) {
+ client_inc = inc;
+ }
+
// med level
tid_t readx(OSDRead *read, Context *onfinish);
tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit);
tid_t lock(int op, object_t oid, Context *onack, Context *oncommit);
- void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+ void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst);
};
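// note: the objecter now tracks a client incarnation and asserts it is set
// before building ops (the assert(client_inc >= 0) added in the submit
// paths above).  hedged sketch of the expected startup order; messenger,
// monmap, osdmap and my_inc are assumed to be in scope, and how the client
// learns my_inc (e.g. from its monitor session) is outside this patch:
//
//   Objecter *objecter = new Objecter(messenger, monmap, osdmap);
//   objecter->set_client_incarnation(my_inc);  // before any readx()/modifyx()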