+#
+# until autoconf is set up, here are the options i understand:
+#
+# target=darwin -- build on darwin
+# fuse=no -- don't build anything requiring FUSE
+# mpi=no -- don't build newsyn (requires MPI)
+# use_ccpp=yes -- use Common C++ for buffer.h reference counting
+# want_bdb=yes -- build berkeleydb objectstore
+#
# mpicxx must be on your path to build newsyn.
# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path.
# on issdm, it's /usr/local/mpich2/bin.
# Hook for extra -I options, etc.
-EXTRA_CFLAGS =
+EXTRA_CFLAGS = -I${HOME}/include -L${HOME}/lib
+
+# base
+CFLAGS = -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
+LDINC = ld -i -o
+CC = g++
+LIBS = -pthread
+# darwin?
ifeq ($(target),darwin)
-# For Darwin
-CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS}
+CFLAGS += -DDARWIN -D__FreeBSD__=10
LDINC = ar -rc
-else
-# For linux
-CFLAGS = -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE
-LDINC = ld -i -o
endif
-CC = g++
-LIBS = -lpthread
-
-ifeq ($(want_bdb),yes)
-CFLAGS += -DUSE_OSBDB
-OSBDB_LIBS = -ldb_cxx
+# use Common C++ (for buffer.h)?
+ifeq ($(use_ccpp),yes)
+CFLAGS += -D_GNU_SOURCE -DBUFFER_USE_CCPP
+LIBS += -lccgnu2 -ldl
endif
+
#for normal mpich2 machines
MPICC = mpicxx
MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS}
client/Trace.o
+# bdbstore? (append to LIBS: plain '=' would drop -pthread and the Common C++ libs set above)
ifeq ($(want_bdb),yes)
+CFLAGS += -DUSE_OSBDB
+LIBS += -ldb_cxx
+OSD_OBJS += osbdb/OSBDB.o
OSBDB_OBJS = \
osbdb/OSBDB.o
-
-OSBDB_OBJ = osbdb.o
endif
-TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cmonctl cfuse fakefuse
-NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap
-
+# targets
+TARGETS = cmon cosd cmds csyn mkmonmap cmonctl fakesyn
SRCS=*.cc */*.cc *.h */*.h */*/*.h
-all: depend ${TARGETS}
+ifneq ($(fuse),no)
+TARGETS += cfuse fakefuse
+endif
-nofuse: depend ${NO_FUSE}
+ifneq ($(mpi),no)
+TARGETS += newsyn
+endif
+
+all: depend ${TARGETS}
test: depend ${TEST_TARGETS}
cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o
- ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@
+cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o
+cfuse: cfuse.cc client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-# misc
-gprof-helper.so: test/gprof-helper.c
- gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl
-
-
# fake*
-fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o
- ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@
+fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
+ ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o
- ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@
+fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
+ ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
# mpi startup
-newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o
- ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@
-
-newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o
- ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@
+newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
+ ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@
# ebofs
libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
${LDINC} $^ -o $@
+# some benchmarking tools
bench/mdtest/mdtest.o: bench/mdtest/mdtest.c
mpicc -c $^ -o $@
mdtest.ceph: bench/mdtest/mdtest.o libceph.o
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
-# OSD test
-
testos: test/testos.o ebofs.o osbdb.o common.o
${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^
-#
-
-%.so: %.cc
- ${CC} -shared -fPIC ${CFLAGS} $< -o $@
-
-clean:
- rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS}
+# bits
common.o: ${COMMON_OBJS}
${LDINC} $@ $^
osbdb.o: ${OSBDB_OBJS}
${LDINC} $@ $^
+
+# generic rules
+%.so: %.cc
+ ${CC} -shared -fPIC ${CFLAGS} $< -o $@
+
%.o: %.cc
${CC} -fPIC ${CFLAGS} -c $< -o $@
%.po: %.cc
${CC} -fPIC ${CFLAGS} -c $< -o $@
+
+# handy
+clean:
+ rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS}
+
count:
cat ${SRCS} | wc -l
cat ${SRCS} | grep -c \;
$(RM) .depend
makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null
+
# now add a line to include the dependency list.
include .depend
+
code cleanup
- endian portability
- word size
- the split/merge plan:
-/ - fragset_t to describe bounds; we need to tolerate concurrent merge/splits
-
-/ - fragtree_t
-/ - get_leaves(fg, ls) needs to be smarter
-/ - force_to_leaf()
-/ - simplified/normalized form.
-
-/ - CDir is never request pinned
-/ - add a CInode sticky_dir flag to somehow pin all cdirs on the fly.
-/ - STICKY dir state and pin? make sure it's kept across import/export/fragment
-/ - pull _bound maps out of Migrator; they are redundant (trust the subtree map!)
-
- - handle_resolve needs to infer splits/merges
-
- - auth journals and applies update in the request update pipeline
- - dirfragtree is lazily consistent. no lock. bcast by primary when it updates.
-
- - bcast to dir replicas
- - inode auth will journal inode update separately/lazily
- - also on handle_resolve(), if there is a mismatch.
- - do i need a fragtrace_t something to tell me where the splits for a given frag occurred?
- - or something like a fragtree_t simplify()?
- - is there any reason to freeze the dir?
- - CDentry objects will be moved to the new frag(s)
- - Server etc. must take care not to carry CDir pointers around; they're unstable!
-
-
-- journal epoch, or something similar
- - reduce size of EMetaBlob by skipping context when inode was already journaled since the last
- SubtreeMap
-
-
- hmm, should we move ESubtreeMap out of the journal?
that would avoid all the icky weirdness in shutdown, with periodic logging, etc.
- need to export stray crap to another mds..
- verify stray is empty on shutdown
-- dirfrag split/merge
- - client readdir for dirfrags
- consistency points/snapshots
- dentry versions vs dirfrags...
-- statfs?
- more testing of failures + thrashing.
- is export prep dir open deadlock properly fixed by forge_replica_dir()?
- failures during recovery stages (resolve, rejoin)... make sure rejoin still works!
-- dirfrag split
- - make sure we are freezing _before_ we fetch to complete the dirfrag, else
- we break commit()'s preconditions when it fetches an incomplete dir.
-
- detect and deal with client failure
- failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul...
- crush tools
-rados+ebofs
-- purge replicated writes from cache. (with exception of partial tail blocks.)
-
-rados paper todo?
-- better experiments
- - berkeleydb objectstore?
-- flush log only in response to subsequent read or write?
-- better behaving recovery
-- justify use of splay.
- - dynamic replication
-- snapshots
-
rados snapshots
- integrate revisions into ObjectCacher
- clean up oid.rev vs op.rev in osd+osdc
- rollback
- rollback logging (to fix slow prepare vs rollback race)
- read+floor_lockout for clean STOGITH-like/fencing semantics after failover.
-- efficiently replicate clone() objects
-- flag missing log entries on crash recovery --> WRNOOP? or WRLOST?
+
- consider implications of nvram writeahead logs
+- clean shutdown?
+- pgmonitor should supplement failure detection
+
+- flag missing log entries on crash recovery --> WRNOOP? or WRLOST?
+
+- efficiently replicate clone() objects
- fix heartbeat wrt new replication
- mark residual pgs obsolete ???
- rdlocks
- optimize remove wrt recovery pushes
-- pg_num changes
- report crashed pgs?
messenger
- exponential backoff on monitor resend attempts (actually, this should go outside the messenger!)
objectcacher
+- merge clean bh's
- ocacher caps transitions vs locks
- test read locks
- osdmonitor, filter
ebofs
+- allow holes
+
- verify proper behavior of conflicting/overlapping reads of clones
-- test(fix) sync()
- combine inodes and/or cnodes into same blocks
- allow btree sets instead of maps
- eliminate nodepools
- nonblocking write on missing onodes?
- fix bug in node rotation on insert (and reenable)
- fix NEAR_LAST_FWD (?)
-- journaling? in NVRAM?
-- metadata in nvram? flash?
+
+- awareness of underlying software/hardware raid in allocator so that we
+ write full stripes _only_.
+ - hmm, that's basically just a large block size.
+
+- rewrite the btree code!
+ - multithreaded
+ - eliminate nodepools
+ - allow btree sets
+ - allow arbitrary embedded data?
+ - allow arbitrary btrees
+ - allow root node(s?) to be embedded in onode, or wherever.
+ - keys and values can be uniform (fixed-size) or non-uniform.
+ - fixed size (if any) is a value in the btree struct.
+ - negative indicates bytes of length value? (1 -> 255bytes, 2 -> 65535 bytes, etc.?)
+ - non-uniform records preceded by length.
+ - keys sorted via a comparator defined in btree root.
+ - lexicographically, by default.
+
+- goal
+ - object btree key->value payload, not just a data blob payload.
+ - better threading behavior.
+ - with transactional goodness!
+
+- onode
+ - object attributes.. as a btree?
+ - blob stream
+ - map stream.
+ - allow blob values.
+
+ -
+
remaining hard problems
mds
- distributed client management
-- anchormgr
- - 2pc
- - independent journal?
- - distributed?
-- link count management
- - also 2pc
- chdir (directory opens!)
- rewrite logstream
- clean up
#include "client/Client.h"
#include "client/fuse.h"
+#include "client/fuse_ll.h"
#include "msg/SimpleMessenger.h"
client->mount();
cerr << "starting fuse on pid " << getpid() << endl;
- ceph_fuse_main(client, argc, argv);
+ if (g_conf.fuse_ll)
+ ceph_fuse_ll_main(client, argc, argv);
+ else
+ ceph_fuse_main(client, argc, argv);
cerr << "fuse finished on pid " << getpid() << endl;
client->unmount();
#include "common/Logger.h"
+
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " "
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami /*<< "." << pthread_self() */ << " "
-#define tout if (g_conf.client_trace) cout << "trace: "
+#define tout if (g_conf.client_trace) traceout
// static logger
//
root = 0;
- set_cache_size(g_conf.client_cache_size);
+ lru.lru_set_max(g_conf.client_cache_size);
// file handles
- free_fh_set.insert(10, 1<<30);
+ free_fd_set.insert(10, 1<<30);
// set up messengers
messenger = m;
void Client::tear_down_cache()
{
- // fh's
- for (hash_map<fh_t, Fh*>::iterator it = fh_map.begin();
- it != fh_map.end();
+ // fd's
+ for (hash_map<int, Fh*>::iterator it = fd_map.begin();
+ it != fd_map.end();
it++) {
Fh *fh = it->second;
dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl;
put_inode(fh->inode);
delete fh;
}
- fh_map.clear();
+ fd_map.clear();
// caps!
// *** FIXME ***
}
-void Client::init() {
+void Client::init()
+{
+
}
Dentry *dn = (Dentry*)lru.lru_expire();
if (!dn) break; // done
- //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl;
+ dout(15) << "trim_cache unlinking dn " << dn->name
+ << " in dir " << hex << dn->dir->parent_inode->inode.ino
+ << endl;
unlink(dn);
}
// hose root?
- if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) {
+ if (lru.lru_get_size() == 0 && root && root->ref == 0 && inode_map.size() == 1) {
+ dout(15) << "trim_cache trimmed root " << root << endl;
delete root;
root = 0;
inode_map.clear();
dout(12) << "insert_inode " << dname << " ino " << st->inode.ino
<< " size " << st->inode.size
<< " mtime " << st->inode.mtime
+ << " mask " << st->mask
+ << " in dir " << dir->parent_inode->inode.ino
<< endl;
if (dn) {
dout(12) << " new dentry+node with ino " << st->inode.ino << endl;
} else {
// actually update info
- dout(12) << " stat inode mask is " << st->inode.mask << endl;
- dn->inode->inode = st->inode;
+ dout(12) << " stat inode mask is " << st->mask << endl;
+ if (st->mask & STAT_MASK_BASE) {
+ dn->inode->inode = st->inode;
+ dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask!
+ }
// ...but don't clobber our mtime, size!
- if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 &&
+ /* isn't this handled below?
+ if ((dn->inode->mask & STAT_MASK_SIZE) == 0 &&
dn->inode->file_wr_size > dn->inode->inode.size)
dn->inode->inode.size = dn->inode->file_wr_size;
- if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 &&
+ if ((dn->inode->mask & STAT_MASK_MTIME) == 0 &&
dn->inode->file_wr_mtime > dn->inode->inode.mtime)
dn->inode->inode.mtime = dn->inode->file_wr_mtime;
+ */
}
// OK, we found it!
assert(dn && dn->inode);
+
+ // save the mask
+ dn->inode->mask = st->mask;
// or do we have newer size/mtime from writing?
if (dn->inode->file_caps() & CAP_FILE_WR) {
}
// symlink?
- if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) {
+ if (dn->inode->inode.is_symlink()) {
if (!dn->inode->symlink)
dn->inode->symlink = new string;
*(dn->inode->symlink) = st->symlink;
if (!root) {
// create
cur = root = new Inode((*pin)->inode, objectcacher);
+ dout(10) << "insert_trace new root is " << root << endl;
inode_map[root->inode.ino] = root;
}
} else {
// not root.
- dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl;
Dir *dir = cur->open_dir();
cur = this->insert_inode(dir, *pin, *pdn);
+ dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << endl;
++pdn;
// move to top of lru!
dout(10) << "chose target mds" << mds << " based on hierarchy" << endl;
} else {
mds = mdsmap->get_random_in_mds();
+ if (mds < 0) mds = 0; // hrm.
dout(10) << "chose random target mds" << mds << " for lack of anything better" << endl;
}
}
<< " and mdsmap " << mdsmap->get_epoch()
<< endl;
+ /*
+ // hack: get+pin root inode
+ Inode *root;
+ _do_lstat("/", STAT_MASK_ALL, &root);
+ _ll_get(root);
+ */
+
+ // trace?
+ if (g_conf.client_trace) {
+ traceout.open(g_conf.client_trace);
+ if (traceout.is_open()) {
+ dout(1) << "opened trace file '" << g_conf.client_trace << "'" << endl;
+ } else {
+ dout(1) << "FAILED to open trace file '" << g_conf.client_trace << "'" << endl;
+ }
+ }
+
client_lock.Unlock();
/*
dout(3) << "op: int readlinkbuf_len = 1000;" << endl;
dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl;
dout(3) << "op: map<string, inode_t*> dir_contents;" << endl;
- dout(3) << "op: map<fh_t, fh_t> open_files;" << endl;
- dout(3) << "op: fh_t fh;" << endl;
+ dout(3) << "op: map<int, int> open_files;" << endl;
+ dout(3) << "op: int fd;" << endl;
*/
return 0;
}
unmounting = true;
// NOTE: i'm assuming all caches are already flushing (because all files are closed).
- assert(fh_map.empty());
+ assert(fd_map.empty());
+
+ dout(10) << "a" << endl;
+
+ _ll_drop_pins();
+ dout(10) << "b" << endl;
+
// empty lru cache
lru.lru_set_max(0);
trim_cache();
!inode_map.empty()) {
dout(2) << "cache still has " << lru.lru_get_size()
<< "+" << inode_map.size() << " items"
- << ", waiting (presumably for safe or for caps to be released?)"
+ << ", waiting (for caps to release?)"
<< endl;
dump_cache();
mount_cond.Wait(client_lock);
}
assert(lru.lru_get_size() == 0);
assert(inode_map.empty());
-
+
// unsafe writes
if (!g_conf.client_oc) {
while (unsafe_sync_write > 0) {
mount_cond.Wait(client_lock);
}
}
+
+ // stop tracing
+ if (g_conf.client_trace) {
+ dout(1) << "closing trace file '" << g_conf.client_trace << "'" << endl;
+ traceout.close();
+ }
+
// send session closes!
for (map<int,version_t>::iterator p = mds_sessions.begin();
}
+// ===============================================================
+// high level (POSIXy) interface
+
// namespace ops
int Client::link(const char *existing, const char *newname)
{
- client_lock.Lock();
- dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "link" << endl;
tout << existing << endl;
tout << newname << endl;
+ return _link(existing, newname);
+}
-
+int Client::_link(const char *existing, const char *newname)
+{
// main path arg is new link name
// sarg is target (existing file)
-
MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst());
req->set_path(newname);
req->set_sarg(existing);
dout(10) << "link result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "link(\"" << existing << "\", \"" << newname << "\") = " << res << endl;
return res;
}
int Client::unlink(const char *relpath)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "unlink" << endl;
+ tout << relpath << endl;
string abspath;
mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->unlink\(\"" << path << "\");" << endl;
- tout << "unlink" << endl;
- tout << path << endl;
+ return _unlink(abspath.c_str());
+}
+int Client::_unlink(const char *path)
+{
MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst());
req->set_path(path);
dout(10) << "unlink result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "unlink(\"" << path << "\") = " << res << endl;
return res;
}
int Client::rename(const char *relfrom, const char *relto)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "rename" << endl;
+ tout << relfrom << endl;
+ tout << relto << endl;
- string absfrom;
+ string absfrom, absto;
mkabspath(relfrom, absfrom);
- const char *from = absfrom.c_str();
- string absto;
mkabspath(relto, absto);
- const char *to = absto.c_str();
-
- dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl;
- tout << "rename" << endl;
- tout << from << endl;
- tout << to << endl;
-
+ return _rename(absfrom.c_str(), absto.c_str());
+}
+int Client::_rename(const char *from, const char *to)
+{
MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst());
req->set_path(from);
req->set_sarg(to);
delete reply;
dout(10) << "rename result is " << res << endl;
+ // renamed item from our cache
+
trim_cache();
- client_lock.Unlock();
+ dout(3) << "rename(\"" << from << "\", \"" << to << "\") = " << res << endl;
return res;
}
int Client::mkdir(const char *relpath, mode_t mode)
{
- client_lock.Lock();
-
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "mkdir" << endl;
- tout << path << endl;
+ tout << relpath << endl;
tout << mode << endl;
+ string abspath;
+ mkabspath(relpath, abspath);
+ return _mkdir(abspath.c_str(), mode);
+}
+int Client::_mkdir(const char *path, mode_t mode)
+{
MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst());
req->set_path(path);
req->args.mkdir.mode = mode;
dout(10) << "mkdir result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+
+ dout(3) << "mkdir(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl;
return res;
}
int Client::rmdir(const char *relpath)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "rmdir" << endl;
+ tout << relpath << endl;
string abspath;
mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->rmdir(\"" << path << "\");" << endl;
- tout << "rmdir" << endl;
- tout << path << endl;
-
+ return _rmdir(abspath.c_str());
+}
+int Client::_rmdir(const char *path)
+{
MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst());
req->set_path(path);
}
insert_trace(reply);
delete reply;
- dout(10) << "rmdir result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "rmdir(\"" << path << "\") = " << res << endl;
return res;
}
int Client::symlink(const char *reltarget, const char *rellink)
{
- client_lock.Lock();
-
- string abstarget;
- mkabspath(reltarget, abstarget);
- const char *target = abstarget.c_str();
- string abslink;
- mkabspath(rellink, abslink);
- const char *link = abslink.c_str();
-
- dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "symlink" << endl;
- tout << target << endl;
- tout << link << endl;
+ tout << reltarget << endl;
+ tout << rellink << endl;
+ string target, link;
+ mkabspath(reltarget, target);
+ mkabspath(rellink, link);
+ return _symlink(target.c_str(), link.c_str());
+}
+int Client::_symlink(const char *target, const char *link)
+{
MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst());
req->set_path(link);
req->set_sarg(target);
int res = reply->get_result();
insert_trace(reply); //FIXME assuming trace of link, not of target
delete reply;
- dout(10) << "symlink result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "symlink(\"" << target << "\", \"" << link << "\") = " << res << endl;
return res;
}
-int Client::readlink(const char *relpath, char *buf, off_t size)
-{
- client_lock.Lock();
-
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl;
+int Client::readlink(const char *path, char *buf, off_t size)
+{
+ Mutex::Locker lock(client_lock);
tout << "readlink" << endl;
tout << path << endl;
- client_lock.Unlock();
- // stat first (FIXME, PERF access cache directly) ****
- struct stat stbuf;
- int r = this->lstat(path, &stbuf);
- if (r != 0) return r;
-
- client_lock.Lock();
-
- // pull symlink content from cache
- Inode *in = inode_map[stbuf.st_ino];
- assert(in); // i just did a stat
-
- // copy into buf (at most size bytes)
- unsigned res = in->symlink->length();
- if (res > size) res = size;
- memcpy(buf, in->symlink->c_str(), res);
+ string abspath;
+ mkabspath(path, abspath);
+ return _readlink(abspath.c_str(), buf, size);
+}
+int Client::_readlink(const char *path, char *buf, off_t size)
+{
+  Inode *in;
+  int r = _do_lstat(path, STAT_MASK_BASE, &in);
+  if (r == 0 && !in->inode.is_symlink()) r = -EINVAL;
+  if (r == 0) {
+    // copy at most size bytes of the link target into buf; no NUL terminator is written
+    r = in->symlink->length();
+    if (r > size) r = size;
+    memcpy(buf, in->symlink->c_str(), r);
+  } else {
+    if (size > 0) buf[0] = 0;  // never write into a zero-length buffer
+  }
trim_cache();
- client_lock.Unlock();
- return res; // return length in bytes (to mimic the system call)
+
+  dout(3) << "readlink(\"" << path << "\", \"" << string(buf, r > 0 ? r : 0) << "\", " << size << ") = " << r << endl;  // buf is not NUL-terminated: log only the r bytes copied
+  return r;
}
// inode stuff
-int Client::_lstat(const char *path, int mask, Inode **in)
+int Client::_do_lstat(const char *path, int mask, Inode **in)
{
MClientRequest *req = 0;
filepath fpath(path);
Dentry *dn = lookup(fpath);
inode_t inode;
utime_t now = g_clock.real_now();
+
if (dn &&
- now <= dn->inode->valid_until &&
- ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) {
+ now <= dn->inode->valid_until)
+ dout(10) << "_lstat has inode " << path << " with mask " << dn->inode->mask << ", want " << mask << endl;
+
+ if (dn && dn->inode &&
+ ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) &&
+ ((dn->inode->mask & mask) == mask)) {
inode = dn->inode->inode;
- dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl;
+ dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << endl;
if (g_conf.client_cache_stat_ttl == 0)
dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir
}
-void Client::fill_stat(inode_t& inode, struct stat *st)
+int Client::fill_stat(Inode *in, struct stat *st)
{
+ dout(10) << "fill_stat on " << in->inode.ino << " mode 0" << oct << in->inode.mode << dec
+ << " mtime " << in->inode.mtime << " ctime " << in->inode.ctime << endl;
memset(st, 0, sizeof(struct stat));
- st->st_ino = inode.ino;
- st->st_mode = inode.mode;
- st->st_nlink = inode.nlink;
- st->st_uid = inode.uid;
- st->st_gid = inode.gid;
- st->st_ctime = MAX(inode.ctime, inode.mtime);
- st->st_atime = inode.atime;
- st->st_mtime = inode.mtime;
- st->st_size = inode.size;
- st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0;
+ st->st_ino = in->inode.ino;
+ st->st_mode = in->inode.mode;
+ st->st_rdev = in->inode.rdev;
+ st->st_nlink = in->inode.nlink;
+ st->st_uid = in->inode.uid;
+ st->st_gid = in->inode.gid;
+ st->st_ctime = MAX(in->inode.ctime, in->inode.mtime);
+ st->st_atime = in->inode.atime;
+ st->st_mtime = in->inode.mtime;
+ st->st_size = in->inode.size;
+ st->st_blocks = in->inode.size ? ((in->inode.size - 1) / 4096 + 1):0;
st->st_blksize = 4096;
+ return in->mask;
}
-void Client::fill_statlite(inode_t& inode, struct statlite *st)
-{
- memset(st, 0, sizeof(struct stat));
- st->st_ino = inode.ino;
- st->st_mode = inode.mode;
- st->st_nlink = inode.nlink;
- st->st_uid = inode.uid;
- st->st_gid = inode.gid;
-#ifndef DARWIN
- // FIXME what's going on here with darwin?
- st->st_ctime = MAX(inode.ctime, inode.mtime);
- st->st_atime = inode.atime;
- st->st_mtime = inode.mtime;
-#endif
- st->st_size = inode.size;
- st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0;
- st->st_blksize = 4096;
-
/*
S_REQUIREBLKSIZE(st->st_litemask);
if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask);
if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask);
if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask);
*/
-}
int Client::lstat(const char *relpath, struct stat *stbuf)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "lstat" << endl;
+ tout << relpath << endl;
string abspath;
mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl;
- tout << "lstat" << endl;
- tout << path << endl;
+ return _lstat(abspath.c_str(), stbuf);
+}
+int Client::_lstat(const char *path, struct stat *stbuf)
+{
Inode *in = 0;
-
- int res = _lstat(path, INODE_MASK_ALL_STAT, &in);
+ int res = _do_lstat(path, STAT_MASK_ALL, &in);
if (res == 0) {
assert(in);
- fill_stat(in->inode,stbuf);
- dout(10) << "stat sez size = " << in->inode.size << " mode = " << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << endl;
+ fill_stat(in, stbuf);
+ dout(10) << "stat sez size = " << in->inode.size << " mode = 0" << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << endl;
}
trim_cache();
- client_lock.Unlock();
+ dout(3) << "lstat(\"" << path << "\", " << stbuf << ") = " << res << endl;
return res;
}
+/*
int Client::lstatlite(const char *relpath, struct statlite *stl)
{
client_lock.Lock();
client_lock.Unlock();
return res;
}
-
+*/
int Client::chmod(const char *relpath, mode_t mode)
{
- client_lock.Lock();
-
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "chmod" << endl;
- tout << path << endl;
+ tout << relpath << endl;
tout << mode << endl;
+ string abspath;
+ mkabspath(relpath, abspath);
+ return _chmod(abspath.c_str(), mode);
+}
+int Client::_chmod(const char *path, mode_t mode)
+{
+ dout(3) << "_chmod(" << path << ", 0" << oct << mode << dec << ")" << endl;
MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst());
req->set_path(path);
req->args.chmod.mode = mode;
int res = reply->get_result();
insert_trace(reply);
delete reply;
- dout(10) << "chmod result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "_chmod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl;
return res;
}
int Client::chown(const char *relpath, uid_t uid, gid_t gid)
{
- client_lock.Lock();
-
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "chown" << endl;
- tout << path << endl;
+ tout << relpath << endl;
tout << uid << endl;
tout << gid << endl;
+ string abspath;
+ mkabspath(relpath, abspath);
+ return _chown(abspath.c_str(), uid, gid);
+}
+int Client::_chown(const char *path, uid_t uid, gid_t gid)
+{
+ dout(3) << "_chown(" << path << ", " << uid << ", " << gid << ")" << endl;
MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, messenger->get_myinst());
req->set_path(path);
req->args.chown.uid = uid;
dout(10) << "chown result is " << res << endl;
trim_cache();
- client_lock.Unlock();
+ dout(3) << "chown(\"" << path << "\", " << uid << ", " << gid << ") = " << res << endl;
return res;
}
int Client::utime(const char *relpath, struct utimbuf *buf)
{
- client_lock.Lock();
-
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl;
- dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl;
+ Mutex::Locker lock(client_lock);
tout << "utime" << endl;
- tout << path << endl;
- tout << buf->actime << endl;
+ tout << relpath << endl;
tout << buf->modtime << endl;
+ tout << buf->actime << endl;
+ string abspath;
+ mkabspath(relpath, abspath);
+ return _utimes(abspath.c_str(), utime_t(buf->modtime,0), utime_t(buf->actime,0));
+}
+int Client::_utimes(const char *path, utime_t mtime, utime_t atime)
+{
+ dout(3) << "_utimes(" << path << ", " << mtime << ", " << atime << ")" << endl;
MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst());
req->set_path(path);
- req->args.utime.mtime.tv_sec = buf->modtime;
- req->args.utime.mtime.tv_usec = 0;
- req->args.utime.atime.tv_sec = buf->actime;
- req->args.utime.atime.tv_usec = 0;
+ req->args.utime.mtime = mtime.tv_ref();
+ req->args.utime.atime = atime.tv_ref();
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
- //FIXME enforce caller uid rights?
-
MClientReply *reply = make_request(req);
int res = reply->get_result();
insert_trace(reply);
delete reply;
- dout(10) << "utime result is " << res << endl;
+ dout(3) << "utimes(\"" << path << "\", " << mtime << ", " << atime << ") = " << res << endl;
trim_cache();
- client_lock.Unlock();
return res;
}
-int Client::mknod(const char *relpath, mode_t mode)
+int Client::mknod(const char *relpath, mode_t mode, dev_t rdev)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "mknod" << endl;
+ tout << relpath << endl;
+ tout << mode << endl;
+ tout << rdev << endl;
string abspath;
mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
-
- dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl;
- tout << "mknod" << endl;
- tout << path << endl;
- tout << mode << endl;
+ return _mknod(abspath.c_str(), mode, rdev);
+}
+int Client::_mknod(const char *path, mode_t mode, dev_t rdev)
+{
+ dout(3) << "_mknod(" << path << ", 0" << oct << mode << dec << ", " << rdev << ")" << endl;
MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst());
req->set_path(path);
req->args.mknod.mode = mode;
+ req->args.mknod.rdev = rdev;
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
int res = reply->get_result();
insert_trace(reply);
- dout(10) << "mknod result is " << res << endl;
-
delete reply;
trim_cache();
- client_lock.Unlock();
+
+ dout(3) << "mknod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl;
return res;
}
-//readdir usually include inode info for each entry except of locked entries
+int Client::getdir(const char *relpath, list<string>& contents) // appends every entry name of relpath to contents; returns the entry count, or opendir()'s negative result
+{
+ dout(3) << "getdir(" << relpath << ")" << endl;
+ { // inner scope: client_lock is held only for the trace output, because opendir()/closedir() below take it themselves
+ Mutex::Locker lock(client_lock);
+ tout << "getdir" << endl;
+ tout << relpath << endl;
+ }
-//
-// getdir
+ DIR *d;
+ int r = opendir(relpath, &d);
+ if (r < 0) return r; // nothing was opened, so there is nothing to close
-// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino);
+ struct dirent de;
+ int n = 0;
+ while (readdir_r(d, &de) == 0) { // readdir_r() yields 0 per entry and -1 once the directory is exhausted
+ contents.push_back(de.d_name);
+ n++;
+ }
+ closedir(d);
+
+ return n; // number of names appended to contents
+}
-int Client::getdir(const char *relpath, map<string,inode_t>& contents)
+// Open directory `name` for reading.
+//
+// Takes client_lock, writes an "opendir" record to the trace stream, and
+// delegates to _opendir(). On return *dirpp holds the new handle (really a
+// DirResult*), or 0 if _opendir() failed. The return value is _opendir()'s
+// result code.
+int Client::opendir(const char *name, DIR **dirpp)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock);
+ tout << "opendir" << endl;
+ tout << name << endl;
- string abspath;
- mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
+ int r = _opendir(name, (DirResult**)dirpp);
+ // Terminate the handle value with a newline like every other trace field
+ // (see closedir()); without it the next record's op name is glued onto the
+ // number and the trace can no longer be parsed line by line.
+ tout << (unsigned long)*dirpp << endl;
+ return r;
+}
- dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl;
- tout << "getdir" << endl;
- tout << path << endl;
+int Client::_opendir(const char *name, DirResult **dirpp) // allocates a DirResult for name and fetches its first frag; on failure *dirpp is reset to 0
+{
+ *dirpp = new DirResult(name); // always allocated first; freed again below if the fetch fails
+
+ // do we have the inode in our cache?
+ // if so, should we ask for a different dirfrag?
+ filepath path(name);
+ Dentry *dn = lookup(path);
+ if (dn && dn->inode) {
+ (*dirpp)->inode = dn->inode;
+ (*dirpp)->inode->get(); // take a reference for the handle; _closedir() gives it back via put_inode()
+ dout(10) << "had inode " << dn->inode << " " << dn->inode->inode.ino << " ref now " << dn->inode->ref << endl;
+ (*dirpp)->set_frag(dn->inode->dirfragtree[0]); // start at the first dirfrag according to the cached fragtree
+ dout(10) << "_opendir " << name << ", our cache says the first dirfrag is " << (*dirpp)->frag() << endl;
+ }
+ // get the first frag
+ int r = _readdir_get_frag(*dirpp);
+ if (r < 0) {
+ _closedir(*dirpp); // drops any inode reference and deletes the DirResult
+ *dirpp = 0; // never hand a dangling handle back to the caller
+ }
+ dout(3) << "_opendir(" << name << ") = " << r << " (" << *dirpp << ")" << endl;
- MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst());
- req->set_path(path);
+ return r; // result of _readdir_get_frag(): negative on error
+}
+
+// Append an entry called `name` to the buffer of dirp's current frag, with
+// its stat data and stat mask taken from inode `in` via fill_stat().
+void Client::_readdir_add_dirent(DirResult *dirp, const string& name, Inode *in)
+{
+ struct stat attrs;
+ int mask = fill_stat(in, &attrs);
+ vector<DirEntry>& entries = dirp->buffer[dirp->frag()];
+ entries.push_back(DirEntry(name, attrs, mask));
+ dout(10) << "_readdir_add_dirent added " << name << ", size now " << entries.size() << endl;
+}
+
+// Append an entry called `name` to the buffer of dirp's current frag when
+// only its type is known: the stat is zeroed except for st_mode (derived
+// from d_type) and the mask is STAT_MASK_TYPE.
+void Client::_readdir_add_dirent(DirResult *dirp, const string& name, unsigned char d_type)
+{
+ struct stat attrs;
+ memset(&attrs, 0, sizeof(attrs));
+ attrs.st_mode = DT_TO_MODE(d_type);
+ vector<DirEntry>& entries = dirp->buffer[dirp->frag()];
+ entries.push_back(DirEntry(name, attrs, STAT_MASK_TYPE));
+ dout(10) << "_readdir_add_dirent added " << name << ", size now " << entries.size() << endl;
+}
+
+// Discard the buffered entries of the current frag and step dirp to the
+// next one. Unless that reaches the end of the directory, the new frag is
+// re-checked against the inode's fragtree.
+void Client::_readdir_next_frag(DirResult *dirp)
+{
+ frag_t prev = dirp->frag();
+
+ // the frag we are leaving must have been buffered; drop its entries
+ assert(dirp->buffer.count(prev));
+ dirp->buffer.erase(prev);
+
+ dirp->next_frag();
+ if (!dirp->at_end()) {
+ dout(10) << "_readdir_next_frag advance from " << prev << " to " << dirp->frag() << endl;
+ _readdir_rechoose_frag(dirp);
+ return;
+ }
+ dout(10) << "_readdir_next_frag advance from " << prev << " to END" << endl;
+}
+
+// Look the current frag's value up in the directory inode's fragtree and,
+// if the tree maps it to a different frag, switch dirp to that frag.
+// Requires dirp->inode to be attached.
+void Client::_readdir_rechoose_frag(DirResult *dirp)
+{
+ assert(dirp->inode);
+ frag_t have = dirp->frag();
+ frag_t mapped = dirp->inode->dirfragtree[have.value()];
+ if (mapped != have) {
+ dout(10) << "_readdir_rechoose_frag frag " << have << " maps to " << mapped << endl;
+ dirp->set_frag(mapped);
+ }
+}
+
+// Fetch the contents of dirp's current frag from the MDS (MDS_OP_READDIR)
+// and store them in dirp->buffer[frag].
+//
+// The frag must not be buffered yet. If the directory inode turns up in
+// inode_map and dirp has none attached, it is attached and referenced.
+// On -EAGAIN the frag is re-chosen from the fragtree and the fetch retried.
+// On any other non-zero result the end flag is set on dirp.
+//
+// Returns the number of entries received in the reply ("." and ".." are
+// buffered for the leftmost frag but not counted), or the negative result.
+int Client::_readdir_get_frag(DirResult *dirp)
+{
+ // get the current frag.
+ frag_t fg = dirp->frag();
+ assert(dirp->buffer.count(fg) == 0);
+
+ dout(10) << "_readdir_get_frag " << dirp << " on " << dirp->path << " fg " << fg << endl;
+ MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst());
+ req->set_path(dirp->path);
+ req->args.readdir.frag = fg;
+
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
-
- //FIXME enforce caller uid rights?
-
+
MClientReply *reply = make_request(req);
int res = reply->get_result();
insert_trace(reply);
+ inodeno_t ino = reply->get_ino();
+
+ // did i get directory inode?
+ Inode *diri = 0;
+ if ((res == -EAGAIN || res == 0) &&
+ inode_map.count(ino)) {
+ diri = inode_map[ino];
+ dout(10) << "_readdir_get_frag got diri " << diri << " " << diri->inode.ino << endl;
+ assert(diri);
+ assert(diri->inode.mode & INODE_MODE_DIR);
+ }
+
+ if (!dirp->inode && diri) {
+ dout(10) << "_readdir_get_frag attaching inode" << endl;
+ dirp->inode = inode_map[ino];
+ diri->get();
+ }
- if (res == 0) {
+ if (res == -EAGAIN) {
+ dout(10) << "_readdir_get_frag got EAGAIN, retrying" << endl;
+ // We are done with this reply (ino and result were already extracted);
+ // free it before retrying, otherwise every EAGAIN round leaks one reply
+ // because the delete at the bottom of the function is never reached.
+ delete reply;
+ // NOTE(review): _readdir_rechoose_frag() asserts dirp->inode, so this
+ // path assumes the directory inode was attached above -- confirm.
+ _readdir_rechoose_frag(dirp);
+ return _readdir_get_frag(dirp);
+ }
- // dir contents to cache!
- inodeno_t ino = reply->get_ino();
- Inode *diri = inode_map[ ino ];
+ if (res == 0) {
+ // stuff dir contents to cache, DirResult
assert(diri);
- assert(diri->inode.mode & INODE_MODE_DIR);
- // add . and ..?
- string dot(".");
- contents[dot] = diri->inode;
- if (diri != root) {
+ // create empty result vector
+ dirp->buffer[fg].clear();
+
+ if (fg.is_leftmost()) {
+ // add . and ..?
+ string dot(".");
+ _readdir_add_dirent(dirp, dot, diri);
string dotdot("..");
- contents[dotdot] = diri->dn->dir->parent_inode->inode;
+ if (diri->dn)
+ _readdir_add_dirent(dirp, dotdot, diri->dn->dir->parent_inode);
+ //else
+ //_readdir_add_dirent(dirp, dotdot, DT_DIR);
}
-
+
// the rest?
- if (!reply->get_dir_in().empty()) {
+ if (!reply->get_dir_dn().empty()) {
// only open dir if we're actually adding stuff to it!
Dir *dir = diri->open_dir();
assert(dir);
utime_t now = g_clock.real_now();
- list<string>::const_iterator pdn = reply->get_dir_dn().begin();
- for (list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
- pin != reply->get_dir_in().end();
- ++pin, ++pdn) {
- // ignore .
- if (*pdn == ".")
- continue;
-
+ list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
+ for (list<string>::const_iterator pdn = reply->get_dir_dn().begin();
+ pdn != reply->get_dir_dn().end();
+ ++pdn, ++pin) {
// count entries
res++;
-
- // put in cache
- Inode *in = this->insert_inode(dir, *pin, *pdn);
-
- if (g_conf.client_cache_stat_ttl) {
- in->valid_until = now;
+
+ // put in cache
+ Inode *in = this->insert_inode(dir, *pin, *pdn);
+
+ if (g_conf.client_cache_stat_ttl) {
+ in->valid_until = now;
in->valid_until += g_conf.client_cache_stat_ttl;
}
- else if (g_conf.client_cache_readdir_ttl) {
- in->valid_until = now;
+ else if (g_conf.client_cache_readdir_ttl) {
+ in->valid_until = now;
in->valid_until += g_conf.client_cache_readdir_ttl;
}
-
- // contents to caller too!
- dout(15) << "getdir including " << *pdn << " to " << in->inode.ino << endl;
- contents[*pdn] = in->inode;
+
+ // contents to caller too!
+ dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << endl;
+ _readdir_add_dirent(dirp, *pdn, in);
}
+
if (dir->is_empty())
close_dir(dir);
}
// FIXME: remove items in cache that weren't in my readdir?
// ***
+ } else {
+ dout(10) << "_readdir_get_frag got error " << res << ", setting end flag" << endl;
+ dirp->set_end();
}
- delete reply; //fix thing above first
+ delete reply;
- client_lock.Unlock();
return res;
}
+// Plain readdir: the same walk as readdirplus_r(), for callers that want
+// neither the stat buffer nor the stat mask, so both are passed as null.
+int Client::readdir_r(DIR *d, struct dirent *de)
+{
+ struct stat *no_stat = 0;
+ int *no_mask = 0;
+ return readdirplus_r(d, de, no_stat, no_mask);
+}
+
+int Client::readdirplus_r(DIR *d, struct dirent *de, struct stat *st, int *stmask) // fills *de (and *st / *stmask when non-null) with the next entry; returns 0, or -1 at end of directory
+{
+ DirResult *dirp = (DirResult*)d;
+
+ while (1) { // loops only to skip over empty frags
+ if (dirp->at_end()) return -1;
+
+ if (dirp->buffer.count(dirp->frag()) == 0) { // current frag not buffered yet: fetch it
+ Mutex::Locker lock(client_lock); // lock is taken only around the fetch; NOTE(review): the buffer accesses and _readdir_next_frag() calls below run without it -- confirm that is intended
+ _readdir_get_frag(dirp); // return value ignored: on error it sets the end flag, which is checked next
+ if (dirp->at_end()) return -1;
+ }
-/** POSIX stubs **/
+ frag_t fg = dirp->frag();
+ uint32_t pos = dirp->fragpos(); // index of the next entry within this frag's buffer
+ assert(dirp->buffer.count(fg));
+ vector<DirEntry> &ent = dirp->buffer[fg];
-DIR *Client::opendir(const char *name)
-{
- DirResult *d = new DirResult;
- d->size = getdir(name, d->contents);
- d->p = d->contents.begin();
- d->off = 0;
- return (DIR*)d;
-}
+ if (ent.empty()) {
+ dout(10) << "empty frag " << fg << ", moving on to next" << endl;
+ _readdir_next_frag(dirp);
+ continue;
+ }
+
+ assert(pos < ent.size());
+ _readdir_fill_dirent(de, &ent[pos], dirp->offset);
+ if (st) *st = ent[pos].st; // optional outputs: copied only when the caller supplied a destination
+ if (stmask) *stmask = ent[pos].stmask;
+ pos++; // local copy only; used for the end-of-frag test just below
+ dirp->offset++;
+
+ if (pos == ent.size()) // that was the last buffered entry of this frag
+ _readdir_next_frag(dirp);
+
+ break;
+ }
-int Client::closedir(DIR *dir)
-{
- DirResult *d = (DirResult*)dir;
- delete d;
return 0;
}
// unsigned char d_type; /* type of file */
// char d_name[256]; /* filename */
//};
-
-struct dirent *Client::readdir(DIR *dirp)
+// Translate one buffered DirEntry into a struct dirent.
+//
+// de    - destination, filled in place
+// entry - buffered entry supplying the name, stat data and stat mask
+// off   - position of this entry; d_off is set to off + 1
+//
+// d_ino is taken from the stat data only when the entry has a non-zero stat
+// mask, otherwise it is 0. d_reclen is always 1.
+// NOTE(review): the code this replaces compiled the d_off/d_reclen/d_type
+// assignments out under DARWIN and __CYGWIN__; they are now unconditional --
+// confirm those platforms' struct dirent has these members.
+void Client::_readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t off)
{
- DirResult *d = (DirResult*)dirp;
-
- // end of dir?
- if (d->p == d->contents.end())
- return 0;
-
- // fill the dirent
- d->dp.d_dirent.d_ino = d->p->second.ino;
-#ifndef __CYGWIN__
-#ifndef DARWIN
- if (d->p->second.is_symlink())
- d->dp.d_dirent.d_type = DT_LNK;
- else if (d->p->second.is_dir())
- d->dp.d_dirent.d_type = DT_DIR;
- else if (d->p->second.is_file())
- d->dp.d_dirent.d_type = DT_REG;
+ if (entry->stmask)
+ de->d_ino = entry->st.st_ino;
else
- d->dp.d_dirent.d_type = DT_UNKNOWN;
-
- d->dp.d_dirent.d_off = d->off;
- d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
-#endif // DARWIN
-#endif
+ de->d_ino = 0;
+ de->d_off = off + 1;
+ de->d_reclen = 1;
+ de->d_type = MODE_TO_DT(entry->st.st_mode);
+ // strncpy() does not write a terminator when the source fills the whole
+ // limit, so copy at most 255 bytes and terminate explicitly; a name of 256
+ // bytes or more is truncated instead of leaving d_name unterminated for the
+ // dout below and for the caller.
+ strncpy(de->d_name, entry->d_name.c_str(), 255);
+ de->d_name[255] = '\0';
+ dout(10) << "_readdir_fill_dirent " << de->d_name << " " << de->d_ino
+ << " type " << (int)de->d_type << " at off " << off << endl;
+}
- strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
+int Client::closedir(DIR *dir) // public entry point: logs a "closedir" trace record and releases the handle via _closedir(); always returns 0
+{
+ Mutex::Locker lock(client_lock);
+ tout << "closedir" << endl;
+ tout << (unsigned long)dir << endl; // handle value, written the same way opendir() records it
- // move up
- ++d->off;
- ++d->p;
+ dout(3) << "closedir(" << dir << ") = 0" << endl; // logged before the handle is freed below
+ _closedir((DirResult*)dir);
+ return 0;
+}
- return &d->dp.d_dirent;
+void Client::_closedir(DirResult *dirp) // releases the handle's inode reference, if any, and deletes the handle
+{
+ dout(10) << "_closedir(" << dirp << ")" << endl;
+ if (dirp->inode) { // an inode is attached by _opendir() or _readdir_get_frag()
+ dout(10) << "_closedir detaching inode " << dirp->inode << endl;
+ put_inode(dirp->inode); // gives back the reference taken with get() when the inode was attached
+ dirp->inode = 0;
+ }
+ delete dirp; // dirp must not be used by the caller after this point
}
-
+
void Client::rewinddir(DIR *dirp)
{
+ dout(3) << "rewinddir(" << dirp << ")" << endl; // NOTE(review): unlike opendir()/closedir(), client_lock is not taken here -- confirm
DirResult *d = (DirResult*)dirp;
- d->p = d->contents.begin();
- d->off = 0;
+ d->offset = 0; // back to the first entry; NOTE(review): whether this also clears the at_end() state is not visible here -- confirm
+ d->buffer.clear(); // drop all buffered frags so the next read fetches them again
}
off_t Client::telldir(DIR *dirp)
{
DirResult *d = (DirResult*)dirp;
- return d->off;
+ dout(3) << "telldir(" << dirp << ") = " << d->offset << endl; // logs the value about to be returned
+ return d->offset; // current position of the handle; readdirplus_r() increments it once per entry returned
}
void Client::seekdir(DIR *dirp, off_t offset)
{
+ dout(3) << "seekdir(" << dirp << ", " << offset << ")" << endl; // NOTE(review): client_lock is not taken here -- confirm
DirResult *d = (DirResult*)dirp;
-
- d->p = d->contents.begin();
- d->off = 0;
-
- if (offset >= d->size) offset = d->size-1;
- while (offset > 0) {
- ++d->p;
- ++d->off;
- --offset;
- }
-}
-
-struct dirent_plus *Client::readdirplus(DIR *dirp)
-{
- DirResult *d = (DirResult*)dirp;
-
- // end of dir?
- if (d->p == d->contents.end())
- return 0;
-
- // fill the dirent
- d->dp.d_dirent.d_ino = d->p->second.ino;
-#ifndef __CYGWIN__
-#ifndef DARWIN
- if (d->p->second.is_symlink())
- d->dp.d_dirent.d_type = DT_LNK;
- else if (d->p->second.is_dir())
- d->dp.d_dirent.d_type = DT_DIR;
- else if (d->p->second.is_file())
- d->dp.d_dirent.d_type = DT_REG;
- else
- d->dp.d_dirent.d_type = DT_UNKNOWN;
-
- d->dp.d_dirent.d_off = d->off;
- d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
-#endif // DARWIN
-#endif
-
- strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
-
- // plus
- if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
- // have it
- fill_stat(d->p->second, &d->dp.d_stat);
- d->dp.d_stat_err = 0;
- } else {
- // don't have it, stat it
- string path = d->path;
- path += "/";
- path += d->p->first;
- d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat);
- }
-
- // move up
- ++d->off;
- ++d->p;
-
- return &d->dp;
+ d->offset = offset; // stored as given, with no clamping; NOTE(review): buffered frags are left untouched and readdirplus_r() asserts the in-frag position is within the buffer -- confirm an arbitrary offset is safe
}
-/*
-struct dirent_lite *Client::readdirlite(DIR *dirp)
-{
- DirResult *d = (DirResult*)dirp;
-
- // end of dir?
- if (d->p == d->contents.end())
- return 0;
-
- // fill the dirent
- d->dp.d_dirent.d_ino = d->p->second.ino;
- if (d->p->second.is_symlink())
- d->dp.d_dirent.d_type = DT_LNK;
- else if (d->p->second.is_dir())
- d->dp.d_dirent.d_type = DT_DIR;
- else if (d->p->second.is_file())
- d->dp.d_dirent.d_type = DT_REG;
- else
- d->dp.d_dirent.d_type = DT_UNKNOWN;
- strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
-
- d->dp.d_dirent.d_off = d->off;
- d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
-
- // plus
- if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
- // have it
- fill_statlite(d->p->second,d->dp.d_stat);
- d->dp.d_stat_err = 0;
- } else {
- // don't have it, stat it
- string path = p->path;
- path += "/";
- path += p->first;
- d->dp.d_statlite
- d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite);
- }
-
- // move up
- ++d->off;
- ++d->p;
-
- return &d->dp;
-}
-*/
int Client::open(const char *relpath, int flags, mode_t mode)
{
- client_lock.Lock();
+ Mutex::Locker lock(client_lock); // replaces the explicit Lock()/Unlock() pair removed by this change
+ tout << "open" << endl; // trace record: operation name, then one argument per line
+ tout << relpath << endl;
+ tout << flags << endl;
string abspath;
mkabspath(relpath, abspath);
- const char *path = abspath.c_str();
- dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl;
- tout << "open" << endl;
- tout << path << endl;
- tout << flags << endl;
+ Fh *fh; // NOTE(review): left uninitialized; _open() stores a handle only on its success path
+ int r = _open(abspath.c_str(), flags, mode, &fh);
+ if (r >= 0) {
+ // allocate an integer file descriptor
+ assert(fh);
+ r = get_fd(); // on success the return value becomes the new descriptor
+ assert(fd_map.count(r) == 0);
+ fd_map[r] = fh;
+ }
+
+ tout << r << endl; // the trace also records the descriptor (or negative result) handed back
+ dout(3) << "open(" << relpath << ", " << flags << ") = " << r << endl;
+ return r;
+}
+int Client::_open(const char *path, int flags, mode_t mode, Fh **fhp)
+{
// go
MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst());
req->set_path(path);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
+
+ // do i have the inode?
+ Dentry *dn = lookup(req->get_filepath());
+ Inode *in = 0;
+ if (dn) {
+ in = dn->inode;
+ in->add_open(cmode); // make note of pending open, since it effects _wanted_ caps.
+ }
MClientReply *reply = make_request(req);
-
assert(reply);
insert_trace(reply);
int result = reply->get_result();
// success?
- fh_t fh = 0;
if (result >= 0) {
// yay
Fh *f = new Fh;
+ if (fhp) *fhp = f;
f->mode = cmode;
// inode
assert(f->inode);
f->inode->get();
- if (cmode & FILE_MODE_R) f->inode->num_open_rd++;
- if (cmode & FILE_MODE_W) f->inode->num_open_wr++;
- if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++;
+ if (!in) {
+ in = f->inode;
+ in->add_open(f->mode);
+ }
// caps included?
int mds = reply->get_source().num();
- if (f->inode->caps.empty()) {// first caps?
- dout(7) << " first caps on " << f->inode->inode.ino << endl;
- f->inode->get();
+ if (in->caps.empty()) {// first caps?
+ dout(7) << " first caps on " << in->inode.ino << endl;
+ in->get();
}
int new_caps = reply->get_file_caps();
- assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq);
- if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) {
- int old_caps = f->inode->caps[mds].caps;
+ assert(reply->get_file_caps_seq() >= in->caps[mds].seq);
+ if (reply->get_file_caps_seq() > in->caps[mds].seq) {
+ int old_caps = in->caps[mds].caps;
dout(7) << "open got caps " << cap_string(new_caps)
<< " (had " << cap_string(old_caps) << ")"
- << " for " << f->inode->ino()
+ << " for " << in->ino()
<< " seq " << reply->get_file_caps_seq()
<< " from mds" << mds
<< endl;
- f->inode->caps[mds].caps = new_caps;
- f->inode->caps[mds].seq = reply->get_file_caps_seq();
+ in->caps[mds].caps = new_caps;
+ in->caps[mds].seq = reply->get_file_caps_seq();
// we shouldn't ever lose caps at this point.
// actually, we might...?
- assert((old_caps & ~f->inode->caps[mds].caps) == 0);
+ assert((old_caps & ~in->caps[mds].caps) == 0);
if (g_conf.client_oc)
- f->inode->fc.set_caps(new_caps);
+ in->fc.set_caps(new_caps);
} else {
dout(7) << "open got SAME caps " << cap_string(new_caps)
- << " for " << f->inode->ino()
+ << " for " << in->ino()
<< " seq " << reply->get_file_caps_seq()
<< " from mds" << mds
<< endl;
}
- // put in map
- result = fh = get_fh();
- assert(fh_map.count(fh) == 0);
- fh_map[fh] = f;
-
- dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl;
- } else {
- dout(0) << "open failure result " << result << endl;
+ dout(5) << "open success, fh is " << f << " combined caps " << cap_string(in->file_caps()) << endl;
}
delete reply;
trim_cache();
- client_lock.Unlock();
return result;
}
mount_cond.Signal();
}
-int Client::close(fh_t fh)
+
+int Client::close(int fd) // releases the file handle registered under fd and forgets the descriptor; always returns 0
{
- client_lock.Lock();
- dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
- dout(3) << "op: open_files.erase( " << fh << " );" << endl;
+ Mutex::Locker lock(client_lock);
tout << "close" << endl;
- tout << fh << endl;
+ tout << fd << endl;
- // get Fh, Inode
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ dout(3) << "close(" << fd << ")" << endl;
+ assert(fd_map.count(fd)); // closing an unknown descriptor is treated as a programming error
+ Fh *fh = fd_map[fd];
+ _release(fh); // NOTE(review): _release()'s return value is ignored here
+ fd_map.erase(fd);
+ return 0;
+}
+
+int Client::_release(Fh *f)
+{
+ //dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
+ //dout(3) << "op: open_files.erase( " << fh << " );" << endl;
+ dout(5) << "_release " << f << endl;
Inode *in = f->inode;
// update inode rd/wr counts
int before = in->file_caps_wanted();
- if (f->mode & FILE_MODE_R)
- in->num_open_rd--;
- if (f->mode & FILE_MODE_W)
- in->num_open_wr--;
+ in->sub_open(f->mode);
int after = in->file_caps_wanted();
// does this change what caps we want?
if (before != after && after)
update_caps_wanted(in);
- // hose fh
- fh_map.erase(fh);
- delete f;
-
// release caps right away?
dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl;
}
put_inode( in );
- int result = 0;
-
- client_lock.Unlock();
- return result;
+ return 0;
}
// read, write
-off_t Client::lseek(fh_t fh, off_t offset, int whence)
+off_t Client::lseek(int fd, off_t offset, int whence)
{
- client_lock.Lock();
- dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl;
+ Mutex::Locker lock(client_lock);
+ tout << "lseek" << endl;
+ tout << fd << endl;
+ tout << offset << endl;
+ tout << whence << endl;
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
Inode *in = f->inode;
switch (whence) {
}
off_t pos = f->pos;
- client_lock.Unlock();
+ dout(3) << "lseek(" << fd << ", " << offset << ", " << whence << ") = " << pos << endl;
return pos;
}
}
+
+//char *hackbuf = 0;
+
+
// blocking osd interface
-int Client::read(fh_t fh, char *buf, off_t size, off_t offset)
+// Read up to `size` bytes at `offset` from the file open on descriptor `fd`
+// into the caller's buffer `buf`.
+//
+// Takes client_lock, writes a "read" trace record, looks up the Fh for fd
+// (an unknown fd trips an assert) and delegates to _read(), which fills a
+// bufferlist. Returns the number of bytes copied into buf, or _read()'s
+// negative result.
+int Client::read(int fd, char *buf, off_t size, off_t offset)
{
- client_lock.Lock();
-
- dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl;
+ Mutex::Locker lock(client_lock);
tout << "read" << endl;
- tout << fh << endl;
+ tout << fd << endl;
tout << size << endl;
tout << offset << endl;
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
+ bufferlist bl;
+ int r = _read(f, offset, size, &bl);
+ // Log the buffer's address, not its contents: streaming a char* prints it
+ // as a NUL-terminated string, and buf has not been filled yet, so that
+ // would read uninitialised (possibly unterminated) caller memory.
+ dout(3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << endl;
+ if (r >= 0) {
+ bl.copy(0, bl.length(), buf);
+ r = bl.length();
+ }
+ return r;
+}
+
+int Client::_read(Fh *f, off_t offset, off_t size, bufferlist *bl)
+{
Inode *in = f->inode;
bool movepos = false;
dout(10) << "file size: " << in->inode.size << endl;
if (offset > 0 && offset >= in->inode.size) {
if (movepos) unlock_fh_pos(f);
- client_lock.Unlock();
return 0;
}
if (offset + size > (off_t)in->inode.size)
if (size == 0) {
dout(10) << "read is size=0, returning 0" << endl;
if (movepos) unlock_fh_pos(f);
- client_lock.Unlock();
return 0;
}
} else {
// defer to OSDs for file bounds.
}
- bufferlist blist; // data will go here
int r = 0;
int rvalue = 0;
if (g_conf.client_oc) {
// object cache ON
- rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block.
+ rvalue = r = in->fc.read(offset, size, *bl, client_lock); // may block.
+
+ /*
+ if (in->inode.ino == 0x10000000075 && hackbuf) {
+ int s = MIN(size, bl->length());
+ char *v = bl->c_str();
+ for (int a=0; a<s; a++)
+ if (v[a] != hackbuf[offset+a])
+ dout(1) << "** hackbuf differs from read value at offset " << a
+ << " hackbuf[a] = " << (int)hackbuf[a] << ", read got " << (int)v[a]
+ << endl;
+ }
+ */
+
} else {
// object cache OFF -- legacy inconsistent way.
bool done = false;
C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
- Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, &blist);
+ Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, bl);
if (in->hack_balance_reads ||
g_conf.client_hack_balance_reads)
rd->balance_reads = true;
if (movepos) {
// adjust fd pos
- f->pos = offset+blist.length();
+ f->pos = offset+bl->length();
unlock_fh_pos(f);
}
- // copy data into caller's char* buf
- blist.copy(0, blist.length(), buf);
-
- //dout(10) << "i read '" << blist.c_str() << "'" << endl;
- dout(10) << "read rvalue " << rvalue << ", r " << r << endl;
-
// done!
- client_lock.Unlock();
return rvalue;
}
client_lock.Unlock();
}
-int Client::write(fh_t fh, const char *buf, off_t size, off_t offset)
+int Client::write(int fd, const char *buf, off_t size, off_t offset) // public entry point: logs the call to the trace stream, maps fd to its Fh, and delegates to _write()
{
- client_lock.Lock();
-
- //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
- dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "write" << endl;
- tout << fh << endl;
+ tout << fd << endl;
tout << size << endl;
tout << offset << endl;
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ assert(fd_map.count(fd)); // writing to an unknown descriptor is treated as a programming error
+ Fh *fh = fd_map[fd];
+ int r = _write(fh, offset, size, buf); // note the argument order: offset and size come before buf
+ dout(3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << endl; // the data itself is deliberately not logged
+ return r; // whatever _write() reported
+}
+
+
+int Client::_write(Fh *f, off_t offset, off_t size, const char *buf)
+{
+ //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
Inode *in = f->inode;
// use/adjust fd pos?
if (g_conf.client_oc) { // buffer cache ON?
assert(objectcacher);
+ /*
+ if (f->inode->inode.ino == 0x10000000075) {
+ if (!hackbuf) {
+ dout(7) << "alloc and zero new hackbuf" << endl;
+ hackbuf = new char[16384];
+ memset(hackbuf, 0, 16384);
+ }
+ dout(7) << "hackbuf copying " << offset << "~" << size << " first is " << (int)buf[0] << endl;
+ memcpy(hackbuf+offset, buf, size);
+ for (int a=0; a<size; a++)
+ dout(10) << "hackbuf[" << (a+offset) << " = " << (int)hackbuf[a+offset] << " = " << (int)buf[a] << endl;
+ }
+ */
+
// write (this may block!)
in->fc.write(offset, size, blist, client_lock);
-
+
} else {
// legacy, inconsistent synchronous write.
dout(7) << "synchronous write" << endl;
in->file_wr_mtime = in->inode.mtime = g_clock.real_now();
// ok!
- client_lock.Unlock();
return totalwritten;
}
-int Client::truncate(const char *file, off_t length)
+int Client::truncate(const char *relpath, off_t length) // public entry point: logs the call to the trace stream, resolves relpath, and delegates to _truncate()
{
- client_lock.Lock();
- dout(3) << "op: client->truncate(\"" << file << "\", " << length << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "truncate" << endl;
- tout << file << endl;
+ tout << relpath << endl; // the trace records the path as the caller gave it, before mkabspath()
tout << length << endl;
+ string path;
+ mkabspath(relpath, path);
+ return _truncate(path.c_str(), length); // the result code comes straight from _truncate()
+}
+int Client::_truncate(const char *file, off_t length)
+{
MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst());
req->set_path(file);
req->args.truncate.length = length;
insert_trace(reply);
delete reply;
- dout(10) << " truncate result is " << res << endl;
+ dout(3) << "truncate(\"" << file << "\", " << length << ") = " << res << endl;
+ return res;
+}
- client_lock.Unlock();
+// Truncate the file open on descriptor `fd` to `length` bytes.
+// Takes client_lock, writes an "ftruncate" trace record (op name, fd and
+// length on separate lines), and hands the descriptor's Fh to _ftruncate(),
+// whose result is returned. An unknown fd trips the assert.
+int Client::ftruncate(int fd, off_t length)
+{
+ Mutex::Locker lock(client_lock);
+ tout << "ftruncate" << endl
+ << fd << endl
+ << length << endl;
+
+ assert(fd_map.count(fd));
+ return _ftruncate(fd_map[fd], length);
+}
+
+int Client::_ftruncate(Fh *fh, off_t length) // sends an MDS_OP_TRUNCATE request addressed by inode number and returns the reply's result code
+{
+ MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst());
+ req->args.truncate.ino = fh->inode->inode.ino; // the target is named by inode number; no path is set on this request
+ req->args.truncate.length = length;
+
+ // FIXME where does FUSE maintain user information
+ req->set_caller_uid(getuid());
+ req->set_caller_gid(getgid());
+
+ MClientReply *reply = make_request(req);
+ int res = reply->get_result();
+ insert_trace(reply);
+ delete reply;
+
+ dout(3) << "ftruncate(\"" << fh << "\", " << length << ") = " << res << endl; // note: what is printed inside the quotes is the Fh pointer, not a path
return res;
}
-int Client::fsync(fh_t fh, bool syncdataonly)
+int Client::fsync(int fd, bool syncdataonly) // public entry point: logs the call to the trace stream, maps fd to its Fh, and delegates to _fsync()
{
- client_lock.Lock();
- dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl;
+ Mutex::Locker lock(client_lock);
tout << "fsync" << endl;
- tout << fh << endl;
+ tout << fd << endl;
tout << syncdataonly << endl;
+ assert(fd_map.count(fd)); // syncing an unknown descriptor is treated as a programming error
+ Fh *f = fd_map[fd];
+ int r = _fsync(f, syncdataonly);
+ dout(3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << endl;
+ return r; // whatever _fsync() reported
+}
+
+int Client::_fsync(Fh *f, bool syncdataonly)
+{
int r = 0;
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
Inode *in = f->inode;
- dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl;
-
// metadata?
if (!syncdataonly) {
dout(0) << "fsync - not syncing metadata yet.. implement me" << endl;
// wait for callback
while (!done) cond.Wait(client_lock);
}
-
- client_lock.Unlock();
return r;
}
int Client::chdir(const char *path)
{
+ Mutex::Locker lock(client_lock);
+ tout << "chdir" << endl;
+ tout << path << endl;
+
// fake it for now!
string abs;
mkabspath(path, abs);
int Client::statfs(const char *path, struct statvfs *stbuf)
{
+ Mutex::Locker lock(client_lock);
+ tout << "statfs" << endl;
+
bzero (stbuf, sizeof (struct statvfs));
// FIXME
stbuf->f_bsize = 1024;
dout(3) << "op: client->lazyio_propogate(" << fd
<< ", " << offset << ", " << count << ")" << endl;
- assert(fh_map.count(fd));
- Fh *f = fh_map[fd];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
Inode *in = f->inode;
if (f->mode & FILE_MODE_LAZY) {
dout(3) << "op: client->lazyio_synchronize(" << fd
<< ", " << offset << ", " << count << ")" << endl;
- assert(fh_map.count(fd));
- Fh *f = fh_map[fd];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
Inode *in = f->inode;
if (f->mode & FILE_MODE_LAZY) {
}
+
+
+// =========================================
+// low level
+
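+// ugly hack for ll: local copies of the FUSE_SET_ATTR_* mask bits tested by ll_setattr() (presumably mirroring FUSE's fuse_lowlevel.h -- keep the values in sync)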
+#define FUSE_SET_ATTR_MODE (1 << 0)
+#define FUSE_SET_ATTR_UID (1 << 1)
+#define FUSE_SET_ATTR_GID (1 << 2)
+#define FUSE_SET_ATTR_SIZE (1 << 3)
+#define FUSE_SET_ATTR_ATIME (1 << 4)
+#define FUSE_SET_ATTR_MTIME (1 << 5)
+
+// Low-level lookup of `name` inside the directory inode `parent`.
+//
+// On success fills *attr, takes one ll reference on the inode that was found
+// and returns 0.  On failure returns a negative errno (-ENOENT, -ENOTDIR, or
+// whatever _readdir_get_frag() reported) and leaves *attr untouched.
+//
+// The trace stream always receives exactly one result line: the inode number
+// that was found, or 0 on any failure.
+int Client::ll_lookup(inodeno_t parent, const char *name, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_lookup " << parent << " " << name << endl;
+  tout << "ll_lookup" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+
+  // everything the "out" label uses is declared before the first goto
+  string dname = name;
+  Inode *diri = 0;
+  int r = 0;
+  ino_t found_ino = 0;   // stays 0 unless the lookup succeeds
+
+  if (inode_map.count(parent) == 0) {
+    dout(1) << "ll_lookup " << parent << " " << name << " -> ENOENT (parent DNE... WTF)" << endl;
+    r = -ENOENT;
+    goto out;
+  }
+  diri = inode_map[parent];
+  if (!diri->inode.is_dir()) {
+    dout(1) << "ll_lookup " << parent << " " << name << " -> ENOTDIR (parent not a dir... WTF)" << endl;
+    r = -ENOTDIR;
+    goto out;
+  }
+
+  // refresh the dir?
+  // FIXME: this is the hackish way.
+  if (!diri->dir ||
+      diri->dir->dentries.count(dname) == 0) {
+    string path;
+    diri->make_path(path);
+    DirResult *dirp = new DirResult(path, diri);
+
+    int fetch_r = 0;
+    while (1) {
+      // re-pick the frag each time around; the fragtree is consulted again
+      // after every fetch
+      hash<string> H;
+      dirp->set_frag(diri->dirfragtree[H(dname)]);
+
+      dout(10) << "ll_lookup fetching frag " << dirp->frag() << " for " << name << endl;
+      fetch_r = _readdir_get_frag(dirp);
+      if (fetch_r < 0) break;
+
+      if (dirp->buffer.count(diri->dirfragtree[H(dname)])) break;
+      dirp->buffer.clear();
+    }
+
+    // always release the DirResult (its constructor took an inode ref), also
+    // when the fetch failed.
+    // NOTE(review): assumes _readdir_get_frag() does not free dirp on error.
+    _closedir(dirp);
+    if (fetch_r < 0) {
+      r = fetch_r;
+      goto out;
+    }
+  }
+
+  // do we have it?
+  if (diri->dir &&
+      diri->dir->dentries.count(dname)) {
+    Inode *in = diri->dir->dentries[dname]->inode;
+    fill_stat(in, attr);
+    _ll_get(in);
+    assert(inode_map[in->inode.ino] == in);
+    found_ino = attr->st_ino;
+  } else {
+    r = -ENOENT;
+  }
+
+ out:
+  dout(3) << "ll_lookup " << parent << " " << name
+	  << " -> " << r << " (" << hex << found_ino << dec << ")" << endl;
+  tout << found_ino << endl;
+  return r;
+}
+
+// Take one low-level reference on `in`.  The first ll reference also takes a
+// regular inode ref; later ones only bump ll_ref.
+void Client::_ll_get(Inode *in)
+{
+  const bool first_ll_ref = (in->ll_ref == 0);
+  if (first_ll_ref)
+    in->get();
+  in->ll_get();
+  dout(20) << "_ll_get " << in << " " << in->inode.ino << " -> " << in->ll_ref << endl;
+}
+
+// Drop `num` low-level references from `in`.  Returns the number of ll
+// references that remain; when that reaches 0 the inode is handed to
+// put_inode(), after which `in` must not be touched any more.
+int Client::_ll_put(Inode *in, int num)
+{
+  in->ll_put(num);
+  dout(20) << "_ll_put " << in << " " << in->inode.ino << " " << num << " -> " << in->ll_ref << endl;
+
+  if (in->ll_ref != 0)
+    return in->ll_ref;
+
+  // last ll reference is gone: release the ref taken by _ll_get()
+  put_inode(in);
+  return 0;
+}
+
+// Release every outstanding low-level reference on every cached inode.
+void Client::_ll_drop_pins()
+{
+  dout(10) << "_ll_drop_pins" << endl;
+  hash_map<inodeno_t, Inode*>::iterator p = inode_map.begin();
+  while (p != inode_map.end()) {
+    Inode *in = p->second;
+    // step past this entry first: _ll_put() may end up erasing it from
+    // inode_map via put_inode()
+    ++p;
+    if (in->ll_ref)
+      _ll_put(in, in->ll_ref);
+  }
+}
+
+// Drop `num` low-level references on inode `ino`.
+// Returns true when the last ll reference went away (and always for the
+// root inode, whose forgets are ignored); false otherwise, including when
+// the inode is not in the cache.
+bool Client::ll_forget(inodeno_t ino, int num)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_forget " << ino << " " << num << endl;
+  tout << "ll_forget" << endl;
+  tout << ino.val << endl;
+  tout << num << endl;
+
+  if (ino == 1) return true;  // ignore forget on root.
+
+  if (inode_map.count(ino) == 0) {
+    dout(1) << "WARNING: ll_forget on " << ino << " " << num
+	    << ", which I don't have" << endl;
+    return false;
+  }
+
+  Inode *in = inode_map[ino];
+  assert(in);
+  return _ll_put(in, num) == 0;
+}
+
+// Map an inode number to its cached Inode.  The only inode allowed to be
+// missing from the cache is the root (ino 1), which is then loaded with an
+// lstat of "/".
+Inode *Client::_ll_get_inode(inodeno_t ino)
+{
+  hash_map<inodeno_t, Inode*>::iterator p = inode_map.find(ino);
+  if (p != inode_map.end())
+    return p->second;
+
+  assert(ino == 1);  // must be the root inode.
+  Inode *in;
+  int r = _do_lstat("/", 0, &in);
+  assert(r >= 0);
+  return in;
+}
+
+
+// Low-level getattr: fill *attr from the cached inode for `ino`.
+// Always returns 0; the result of fill_stat() is not inspected.
+int Client::ll_getattr(inodeno_t ino, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_getattr " << ino << endl;
+  tout << "ll_getattr" << endl;
+  tout << ino.val << endl;
+
+  fill_stat(_ll_get_inode(ino), attr);
+  return 0;
+}
+
+// Low-level setattr: apply the fields of *attr selected by `mask`
+// (FUSE_SET_ATTR_* bits) to inode `ino`, then refresh *attr from the inode.
+//
+// Returns 0 on success, or the first negative result reported by
+// _chmod/_chown/_truncate/_utimes; later fields are not applied after a
+// failure.
+int Client::ll_setattr(inodeno_t ino, struct stat *attr, int mask)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_setattr " << ino << " mask " << hex << mask << dec << endl;
+  tout << "ll_setattr" << endl;
+  tout << ino.val << endl;
+  tout << attr->st_mode << endl;
+  tout << attr->st_uid << endl;
+  tout << attr->st_gid << endl;
+  tout << attr->st_size << endl;
+  tout << attr->st_mtime << endl;
+  tout << attr->st_atime << endl;
+  tout << mask << endl;
+
+  Inode *in = _ll_get_inode(ino);
+
+  string path;
+  in->make_path(path);
+
+  // must start at 0: when no handled mask bit is set none of the calls
+  // below runs, and r is still read by the assert and the log line.
+  int r = 0;
+  if ((mask & FUSE_SET_ATTR_MODE) &&
+      ((r = _chmod(path.c_str(), attr->st_mode)) < 0)) return r;
+
+  // ownership is only changed when both uid and gid are requested
+  if ((mask & FUSE_SET_ATTR_UID) && (mask & FUSE_SET_ATTR_GID) &&
+      ((r = _chown(path.c_str(), attr->st_uid, attr->st_gid)) < 0)) return r;
+  //if ((mask & FUSE_SET_ATTR_GID) &&
+  //(r = client->_chgrp(path.c_str(), attr->st_gid) < 0)) return r;
+
+  if ((mask & FUSE_SET_ATTR_SIZE) &&
+      ((r = _truncate(path.c_str(), attr->st_size)) < 0)) return r;
+
+  // a timestamp that was not requested is passed as a default utime_t()
+  if ((mask & FUSE_SET_ATTR_MTIME) && (mask & FUSE_SET_ATTR_ATIME)) {
+    if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t(attr->st_atime,0))) < 0) return r;
+  } else if (mask & FUSE_SET_ATTR_MTIME) {
+    if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t())) < 0) return r;
+  } else if (mask & FUSE_SET_ATTR_ATIME) {
+    if ((r = _utimes(path.c_str(), utime_t(), utime_t(attr->st_atime,0))) < 0) return r;
+  }
+
+  assert(r == 0);
+  fill_stat(in, attr);
+
+  dout(3) << "ll_setattr " << ino << " = " << r << endl;
+  return 0;
+}
+
+// Low-level readlink.  On success *value points at the inode's cached
+// symlink string and 0 is returned; for a non-symlink *value is set to ""
+// and -EINVAL is returned.
+int Client::ll_readlink(inodeno_t ino, const char **value)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_readlink " << ino << endl;
+  tout << "ll_readlink" << endl;
+  tout << ino.val << endl;
+
+  Inode *in = _ll_get_inode(ino);
+  int r;
+  if (!in->inode.is_symlink()) {
+    *value = "";
+    r = -EINVAL;
+  } else {
+    // points into storage owned by the inode, not a copy
+    *value = in->symlink->c_str();
+    r = 0;
+  }
+  dout(3) << "ll_readlink " << ino << " = " << r << " (" << *value << ")" << endl;
+  return r;
+}
+
+// Low-level mknod: create node `name` under directory inode `parent`.
+// On success fills *attr, takes one ll reference on the new inode and
+// returns 0; otherwise returns the result of _mknod() and leaves *attr
+// untouched.  The trace and log record the new inode number, or 0 on failure.
+int Client::ll_mknod(inodeno_t parent, const char *name, mode_t mode, dev_t rdev, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_mknod " << parent << " " << name << endl;
+  tout << "ll_mknod" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+  tout << mode << endl;
+  tout << rdev << endl;
+
+  Inode *diri = _ll_get_inode(parent);
+
+  string path;
+  diri->make_path(path);
+  path += "/";
+  path += name;
+
+  // *attr is only written on success, so do not read attr->st_ino for the
+  // trace/log unless it was filled in here
+  ino_t new_ino = 0;
+  int r = _mknod(path.c_str(), mode, rdev);
+  if (r == 0) {
+    string dname(name);
+    Inode *in = diri->dir->dentries[dname]->inode;
+    fill_stat(in, attr);
+    _ll_get(in);
+    new_ino = attr->st_ino;
+  }
+  tout << new_ino << endl;
+  dout(3) << "ll_mknod " << parent << " " << name
+	  << " = " << r << " (" << hex << new_ino << dec << ")" << endl;
+  return r;
+}
+
+// Low-level mkdir: create directory `name` under directory inode `parent`.
+// On success fills *attr, takes one ll reference on the new inode and
+// returns 0; otherwise returns the result of _mkdir() and leaves *attr
+// untouched.  The trace and log record the new inode number, or 0 on failure.
+int Client::ll_mkdir(inodeno_t parent, const char *name, mode_t mode, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_mkdir " << parent << " " << name << endl;
+  tout << "ll_mkdir" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+  tout << mode << endl;
+
+  Inode *diri = _ll_get_inode(parent);
+
+  string path;
+  diri->make_path(path);
+  path += "/";
+  path += name;
+
+  // *attr is only written on success, so do not read attr->st_ino for the
+  // trace/log unless it was filled in here
+  ino_t new_ino = 0;
+  int r = _mkdir(path.c_str(), mode);
+  if (r == 0) {
+    string dname(name);
+    Inode *in = diri->dir->dentries[dname]->inode;
+    fill_stat(in, attr);
+    _ll_get(in);
+    new_ino = attr->st_ino;
+  }
+  tout << new_ino << endl;
+  dout(3) << "ll_mkdir " << parent << " " << name
+	  << " = " << r << " (" << hex << new_ino << dec << ")" << endl;
+  return r;
+}
+
+// Low-level symlink: create `name` under directory inode `parent` pointing
+// at `value`.  On success fills *attr, takes one ll reference on the new
+// inode and returns 0; otherwise returns the result of _symlink() and leaves
+// *attr untouched.  The trace and log record the new inode number, or 0 on
+// failure.
+int Client::ll_symlink(inodeno_t parent, const char *name, const char *value, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_symlink " << parent << " " << name << " -> " << value << endl;
+  tout << "ll_symlink" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+  tout << value << endl;
+
+  Inode *diri = _ll_get_inode(parent);
+
+  string path;
+  diri->make_path(path);
+  path += "/";
+  path += name;
+
+  // *attr is only written on success, so do not read attr->st_ino for the
+  // trace/log unless it was filled in here
+  ino_t new_ino = 0;
+  int r = _symlink(value, path.c_str());
+  if (r == 0) {
+    string dname(name);
+    Inode *in = diri->dir->dentries[dname]->inode;
+    fill_stat(in, attr);
+    _ll_get(in);
+    new_ino = attr->st_ino;
+  }
+  tout << new_ino << endl;
+  dout(3) << "ll_symlink " << parent << " " << name
+	  << " = " << r << " (" << hex << new_ino << dec << ")" << endl;
+  return r;
+}
+
+// Low-level unlink: remove entry `name` from directory inode `ino`.
+// Returns whatever _unlink() returns for the assembled path.
+int Client::ll_unlink(inodeno_t ino, const char *name)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_unlink " << ino << " " << name << endl;
+  tout << "ll_unlink" << endl;
+  tout << ino.val << endl;
+  tout << name << endl;
+
+  // <path of the directory> "/" <name>
+  string target;
+  _ll_get_inode(ino)->make_path(target);
+  target.append("/").append(name);
+
+  const int r = _unlink(target.c_str());
+  return r;
+}
+
+// Low-level rmdir: remove directory `name` from directory inode `ino`.
+// Returns whatever _rmdir() returns for the assembled path.
+int Client::ll_rmdir(inodeno_t ino, const char *name)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_rmdir " << ino << " " << name << endl;
+  tout << "ll_rmdir" << endl;
+  tout << ino.val << endl;
+  tout << name << endl;
+
+  // <path of the parent directory> "/" <name>
+  string target;
+  _ll_get_inode(ino)->make_path(target);
+  target.append("/").append(name);
+
+  const int r = _rmdir(target.c_str());
+  return r;
+}
+
+// Low-level rename: move `name` in directory inode `parent` to `newname` in
+// directory inode `newparent`.  Returns whatever _rename() returns.
+int Client::ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_rename " << parent << " " << name << " to "
+	  << newparent << " " << newname << endl;
+  tout << "ll_rename" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+  tout << newparent.val << endl;
+  tout << newname << endl;
+
+  // source: <path of parent> "/" <name>
+  string from;
+  _ll_get_inode(parent)->make_path(from);
+  from.append("/").append(name);
+
+  // destination: <path of newparent> "/" <newname>
+  string to;
+  _ll_get_inode(newparent)->make_path(to);
+  to.append("/").append(newname);
+
+  const int r = _rename(from.c_str(), to.c_str());
+  return r;
+}
+
+// Low-level link: create hard link `newname` in directory inode `newparent`
+// referring to inode `ino`.  On success fills *attr from the inode found
+// under the new name, takes one ll reference on it and returns 0; otherwise
+// returns the result of _link().
+int Client::ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_link " << ino << " to " << newparent << " " << newname << endl;
+  tout << "ll_link" << endl;
+  tout << ino.val << endl;
+  // trace the raw value, like every other inode number written to the trace
+  tout << newparent.val << endl;
+  tout << newname << endl;
+
+  Inode *old = _ll_get_inode(ino);
+  Inode *diri = _ll_get_inode(newparent);
+
+  string path;
+  old->make_path(path);
+
+  string newpath;
+  diri->make_path(newpath);
+  newpath += "/";
+  newpath += newname;
+
+  int r = _link(path.c_str(), newpath.c_str());
+  if (r == 0) {
+    string dname(newname);
+    Inode *in = diri->dir->dentries[dname]->inode;
+    fill_stat(in, attr);
+    _ll_get(in);
+  }
+  return r;
+}
+
+// Low-level opendir: open the directory identified by inode `ino`.
+// *dirpp receives the DirResult produced by _opendir(); the return value is
+// whatever _opendir() returns.
+int Client::ll_opendir(inodeno_t ino, void **dirpp)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_opendir " << ino << endl;
+  tout << "ll_opendir" << endl;
+  tout << ino.val << endl;
+
+  // resolve through _ll_get_inode() like the other ll_* calls: a plain
+  // inode_map[ino] would insert a null entry for an unknown ino, and could
+  // not load the root inode on demand
+  Inode *diri = _ll_get_inode(ino);
+  assert(diri);
+  string path;
+  diri->make_path(path);
+
+  int r = _opendir(path.c_str(), (DirResult**)dirpp);
+
+  tout << (unsigned long)*dirpp << endl;
+
+  dout(3) << "ll_opendir " << ino << " = " << r << " (" << *dirpp << ")" << endl;
+  return r;
+}
+
+// Low-level releasedir: close a directory handle obtained from ll_opendir().
+void Client::ll_releasedir(void *dirp)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_releasedir " << dirp << endl;
+  tout << "ll_releasedir" << endl;
+  tout << (unsigned long)dirp << endl;
+
+  // the opaque handle is the DirResult stored by ll_opendir()
+  DirResult *dr = static_cast<DirResult*>(dirp);
+  _closedir(dr);
+}
+
+// Low-level open: open inode `ino` with `flags` (mode 0).  *fhp is set by
+// _open(); the return value is whatever _open() returns.
+// NOTE(review): *fhp is traced and logged even when _open() fails -- confirm
+// that _open() always assigns it.
+int Client::ll_open(inodeno_t ino, int flags, Fh **fhp)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_open " << ino << " " << flags << endl;
+  tout << "ll_open" << endl;
+  tout << ino.val << endl;
+  tout << flags << endl;
+
+  string target;
+  _ll_get_inode(ino)->make_path(target);
+
+  const int r = _open(target.c_str(), flags, 0, fhp);
+
+  tout << (unsigned long)*fhp << endl;
+  dout(3) << "ll_open " << ino << " " << flags << " = " << r << " (" << *fhp << ")" << endl;
+  return r;
+}
+
+// Low-level create: open `name` under directory inode `parent` with
+// flags|O_CREAT and the given mode.
+//
+// On success *fhp holds the new handle, *attr is filled from its inode and 0
+// is returned.  On failure the negative result of _open() is returned and
+// *attr is left untouched.  The trace and log record the inode number, or 0
+// on failure.
+// NOTE(review): *fhp is traced and logged even when _open() fails -- confirm
+// that _open() always assigns it.
+int Client::ll_create(inodeno_t parent, const char *name, mode_t mode, int flags,
+		      struct stat *attr, Fh **fhp)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags << endl;
+  tout << "ll_create" << endl;
+  tout << parent.val << endl;
+  tout << name << endl;
+  tout << mode << endl;
+  tout << flags << endl;
+
+  Inode *pin = _ll_get_inode(parent);
+  string path;
+  pin->make_path(path);
+  path += "/";
+  path += name;
+
+  // *attr is only written on success, so do not read attr->st_ino for the
+  // trace/log unless it was filled in here
+  ino_t new_ino = 0;
+  int r = _open(path.c_str(), flags|O_CREAT, mode, fhp);
+  if (r >= 0) {
+    Inode *in = (*fhp)->inode;
+    fill_stat(in, attr);
+    //_ll_get(in);
+    new_ino = attr->st_ino;
+  }
+  tout << (unsigned long)*fhp << endl;
+  tout << new_ino << endl;
+  dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags
+	  << " = " << r << " (" << *fhp << " " << hex << new_ino << dec << ")" << endl;
+
+  // report a failed _open() to the caller instead of claiming success;
+  // success is still reported as 0
+  return r < 0 ? r : 0;
+}
+
+// Low-level read: read `len` bytes at offset `off` from handle `fh` into
+// *bl.  Returns whatever _read() returns.
+int Client::ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_read " << fh << " " << off << "~" << len << endl;
+  tout << "ll_read" << endl;
+  tout << (unsigned long)fh << endl;
+  tout << off << endl;
+  tout << len << endl;
+
+  const int r = _read(fh, off, len, bl);
+  return r;
+}
+
+// Low-level write: write `len` bytes from `data` at offset `off` to handle
+// `fh`.  Returns whatever _write() returns.
+int Client::ll_write(Fh *fh, off_t off, off_t len, const char *data)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_write " << fh << " " << off << "~" << len << endl;
+  tout << "ll_write" << endl;
+  tout << (unsigned long)fh << endl;
+  tout << off << endl;
+  tout << len << endl;
+
+  const int r = _write(fh, off, len, data);
+  return r;
+}
+
+// Low-level release: close handle `fh`.  Always returns 0; the result of
+// _release() is discarded.
+int Client::ll_release(Fh *fh)
+{
+  Mutex::Locker lock(client_lock);
+  dout(3) << "ll_release " << fh << endl;
+  tout << "ll_release" << endl;
+  tout << (unsigned long)fh << endl;
+
+  (void)_release(fh);
+  return 0;
+}
+
+
+
+
+
+
// =========================================
// layout
-int Client::describe_layout(int fh, FileLayout *lp)
+int Client::describe_layout(int fd, FileLayout *lp)
{
- client_lock.Lock();
- dout(3) << "op: client->describe_layout(" << fh << ");" << endl;
+ Mutex::Locker lock(client_lock);
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
Inode *in = f->inode;
*lp = in->inode.layout;
- client_lock.Unlock();
+ dout(3) << "describe_layout(" << fd << ") = 0" << endl;
return 0;
}
return layout.period();
}
-int Client::enumerate_layout(int fh, list<ObjectExtent>& result,
+int Client::enumerate_layout(int fd, list<ObjectExtent>& result,
off_t length, off_t offset)
{
- client_lock.Lock();
- dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl;
+ Mutex::Locker lock(client_lock);
- assert(fh_map.count(fh));
- Fh *f = fh_map[fh];
+ assert(fd_map.count(fd));
+ Fh *f = fd_map[fd];
Inode *in = f->inode;
// map to a list of extents
filer->file_to_extents(in->inode, offset, length, result);
- client_lock.Unlock();
+ dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << endl;
return 0;
}
// stl
#include <set>
#include <map>
+#include <fstream>
using namespace std;
#include <ext/hash_map>
*/
-typedef int fh_t;
-
class Dir;
class Inode;
public:
inode_t inode; // the actual inode
utime_t valid_until;
+ int mask;
// about the dir (if this is one!)
int dir_auth;
int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers
int ref; // ref count. 1 for each dentry, fh that links to me.
+ int ll_ref; // separate ref count for ll client
Dir *dir; // if i'm a dir.
Dentry *dn; // if i'm linked to a dentry.
string *symlink; // symlink content, if it's a symlink
+ fragtree_t dirfragtree;
// for caching i/o mode
FileCache fc;
void get() {
ref++;
- //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl;
+ //cout << "inode.get on " << this << " " << hex << inode.ino << dec << " now " << ref << endl;
+ }
+ void put(int n=1) {
+ ref -= n; assert(ref >= 0);
+ //cout << "inode.put on " << this << " " << hex << inode.ino << dec << " now " << ref << endl;
+ }
+
+ void ll_get() {
+ ll_ref++;
}
- void put() {
- ref--; assert(ref >= 0);
- //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl;
+ void ll_put(int n=1) {
+ assert(ll_ref >= n);
+ ll_ref -= n;
}
Inode(inode_t _inode, ObjectCacher *_oc) :
dir_auth(-1), dir_hashed(false), dir_replicated(false),
file_wr_mtime(0, 0), file_wr_size(0),
num_open_rd(0), num_open_wr(0), num_open_lazy(0),
- ref(0), dir(0), dn(0), symlink(0),
+ ref(0), ll_ref(0),
+ dir(0), dn(0), symlink(0),
fc(_oc, _inode),
sync_reads(0), sync_writes(0),
hack_balance_reads(false)
return w;
}
+ // Count one more open of this inode for each mode bit set in cmode.
+ void add_open(int cmode) {
+   num_open_rd   += (cmode & FILE_MODE_R)    ? 1 : 0;
+   num_open_wr   += (cmode & FILE_MODE_W)    ? 1 : 0;
+   num_open_lazy += (cmode & FILE_MODE_LAZY) ? 1 : 0;
+ }
+ // Count one open fewer of this inode for each mode bit set in cmode.
+ void sub_open(int cmode) {
+   num_open_rd   -= (cmode & FILE_MODE_R)    ? 1 : 0;
+   num_open_wr   -= (cmode & FILE_MODE_W)    ? 1 : 0;
+   num_open_lazy -= (cmode & FILE_MODE_LAZY) ? 1 : 0;
+ }
+
int authority(MDSMap *mdsmap) {
//cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl;
// parent?
public:
/* getdir result */
+ // One buffered directory entry: a name, plus stat data when it was supplied.
+ struct DirEntry {
+   string d_name;     // entry name
+   struct stat st;    // only set by the three-argument constructor
+   int stmask;        // 0 when constructed from a name alone
+
+   // name-only entry: st is left unset, stmask is 0
+   DirEntry(const string &name)
+     : d_name(name), stmask(0) {}
+
+   // entry with a copy of the given stat data and its mask
+   DirEntry(const string &name, struct stat& stbuf, int mask)
+     : d_name(name), st(stbuf), stmask(mask) {}
+ };
+
struct DirResult {
+ static const int SHIFT = 28;
+ static const int64_t MASK = (1 << SHIFT) - 1;
+ static const off_t END = 1ULL << (SHIFT + 32);
+
string path;
- map<string,inode_t> contents;
- map<string,inode_t>::iterator p;
- int off;
- int size;
- struct dirent_plus dp;
- struct dirent_lite dl;
- DirResult() : p(contents.end()), off(-1), size(0) {}
+ Inode *inode;
+ int64_t offset; // high bits: frag_t, low bits: an offset
+ map<frag_t, vector<DirEntry> > buffer;
+
+ DirResult(const char *p, Inode *in=0) : path(p), inode(in), offset(0) {
+ if (inode) inode->get();
+ }
+ DirResult(const string &p, Inode *in=0) : path(p), inode(in), offset(0) {
+ if (inode) inode->get();
+ }
+
+ frag_t frag() { return frag_t(offset >> SHIFT); }
+ unsigned fragpos() { return offset & MASK; }
+
+ void next_frag() {
+ frag_t fg = offset >> SHIFT;
+ if (fg.is_rightmost())
+ set_end();
+ else
+ set_frag(fg.next());
+ }
+ void set_frag(frag_t f) {
+ offset = (uint64_t)f << SHIFT;
+ assert(sizeof(offset) == 8);
+ }
+ void set_end() { offset = END; }
+ bool at_end() { return (offset == END); }
};
// file handles, etc.
string cwd;
- interval_set<fh_t> free_fh_set; // unused fh's
- hash_map<fh_t, Fh*> fh_map;
+ interval_set<int> free_fd_set; // unused fds
+ hash_map<int, Fh*> fd_map;
- fh_t get_fh() {
- fh_t fh = free_fh_set.start();
- free_fh_set.erase(fh, 1);
- return fh;
+ int get_fd() {
+ int fd = free_fd_set.start();
+ free_fd_set.erase(fd, 1);
+ return fd;
}
- void put_fh(fh_t fh) {
- free_fh_set.insert(fh, 1);
+ void put_fd(int fd) {
+ free_fd_set.insert(fd, 1);
}
void mkabspath(const char *rel, string& abs) {
// -- metadata cache stuff
// decrease inode ref. delete if dangling.
- void put_inode(Inode *in) {
- in->put();
+ void put_inode(Inode *in, int n=1) {
+ //cout << "put_inode on " << in << " " << in->inode.ino << endl;
+ in->put(n);
if (in->ref == 0) {
+ //cout << "put_inode deleting " << in->inode.ino << endl;
inode_map.erase(in->inode.ino);
if (in == root) root = 0;
delete in;
put_inode(in); // unpin inode
}
- int get_cache_size() { return lru.lru_get_size(); }
- void set_cache_size(int m) { lru.lru_set_max(m); }
+ //int get_cache_size() { return lru.lru_get_size(); }
+ //void set_cache_size(int m) { lru.lru_set_max(m); }
Dentry* link(Dir *dir, const string& name, Inode *in) {
Dentry *dn = new Dentry;
// find dentry based on filepath
Dentry *lookup(filepath& path);
- void fill_stat(inode_t& inode, struct stat *st);
- void fill_statlite(inode_t& inode, struct statlite *st);
+ int fill_stat(Inode *in, struct stat *st);
+
+
+ // trace generation
+ ofstream traceout;
// friends
}
};
+ // some helpers
+ int _do_lstat(const char *path, int mask, Inode **in);
+ int _opendir(const char *name, DirResult **dirpp);
+ void _readdir_add_dirent(DirResult *dirp, const string& name, Inode *in);
+ void _readdir_add_dirent(DirResult *dirp, const string& name, unsigned char d_type);
+ bool _readdir_have_frag(DirResult *dirp);
+ void _readdir_next_frag(DirResult *dirp);
+ void _readdir_rechoose_frag(DirResult *dirp);
+ int _readdir_get_frag(DirResult *dirp);
+ void _readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t);
+ void _closedir(DirResult *dirp);
+ void _ll_get(Inode *in);
+ int _ll_put(Inode *in, int num);
+ void _ll_drop_pins();
+
+ // internal interface
+ // call these with client_lock held!
+ int _link(const char *existing, const char *newname);
+ int _unlink(const char *path);
+ int _rename(const char *from, const char *to);
+ int _mkdir(const char *path, mode_t mode);
+ int _rmdir(const char *path);
+ int _readlink(const char *path, char *buf, off_t size);
+ int _symlink(const char *existing, const char *newname);
+ int _lstat(const char *path, struct stat *stbuf);
+ int _chmod(const char *relpath, mode_t mode);
+ int _chown(const char *relpath, uid_t uid, gid_t gid);
+ int _utimes(const char *relpath, utime_t mtime, utime_t atime);
+ int _mknod(const char *path, mode_t mode, dev_t rdev);
+ int _open(const char *path, int flags, mode_t mode, Fh **fhp);
+ int _release(Fh *fh);
+ int _read(Fh *fh, off_t offset, off_t size, bufferlist *bl);
+ int _write(Fh *fh, off_t offset, off_t size, const char *buf);
+ int _truncate(const char *file, off_t length);
+ int _ftruncate(Fh *fh, off_t length);
+ int _fsync(Fh *fh, bool syncdataonly);
+
+
public:
int mount();
int unmount();
const string getcwd() { return cwd; }
// namespace ops
- int getdir(const char *path, list<string>& contents);
- int getdir(const char *path, map<string,inode_t>& contents);
+ int getdir(const char *relpath, list<string>& names); // get the whole dir at once.
- DIR *opendir(const char *name);
- int closedir(DIR *dir);
- struct dirent *readdir(DIR *dir);
- void rewinddir(DIR *dir);
- off_t telldir(DIR *dir);
- void seekdir(DIR *dir, off_t offset);
+ int opendir(const char *name, DIR **dirpp);
+ int closedir(DIR *dirp);
+ int readdir_r(DIR *dirp, struct dirent *de);
+ int readdirplus_r(DIR *dirp, struct dirent *de, struct stat *st, int *stmask);
+ void rewinddir(DIR *dirp);
+ off_t telldir(DIR *dirp);
+ void seekdir(DIR *dirp, off_t offset);
struct dirent_plus *readdirplus(DIR *dirp);
int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result);
struct dirent_lite *readdirlite(DIR *dirp);
int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result);
-
int link(const char *existing, const char *newname);
int unlink(const char *path);
int rename(const char *from, const char *to);
int symlink(const char *existing, const char *newname);
// inode stuff
- int _lstat(const char *path, int mask, Inode **in);
int lstat(const char *path, struct stat *stbuf);
int lstatlite(const char *path, struct statlite *buf);
int chmod(const char *path, mode_t mode);
int chown(const char *path, uid_t uid, gid_t gid);
int utime(const char *path, struct utimbuf *buf);
-
+
// file ops
- int mknod(const char *path, mode_t mode);
+ int mknod(const char *path, mode_t mode, dev_t rdev=0);
int open(const char *path, int flags, mode_t mode=0);
- int close(fh_t fh);
- off_t lseek(fh_t fh, off_t offset, int whence);
- int read(fh_t fh, char *buf, off_t size, off_t offset=-1);
- int write(fh_t fh, const char *buf, off_t size, off_t offset=-1);
+ int close(int fd);
+ off_t lseek(int fd, off_t offset, int whence);
+ int read(int fd, char *buf, off_t size, off_t offset=-1);
+ int write(int fd, const char *buf, off_t size, off_t offset=-1);
+ int fake_write_size(int fd, off_t size);
int truncate(const char *file, off_t size);
- //int truncate(fh_t fh, long long size);
- int fsync(fh_t fh, bool syncdataonly);
-
+ int ftruncate(int fd, off_t size);
+ int fsync(int fd, bool syncdataonly);
// hpc lazyio
int lazyio_propogate(int fd, off_t offset, size_t count);
int enumerate_layout(int fd, list<ObjectExtent>& result,
off_t length, off_t offset);
+ // low-level interface
+ int ll_lookup(inodeno_t parent, const char *name, struct stat *attr);
+ bool ll_forget(inodeno_t ino, int count);
+ Inode *_ll_get_inode(inodeno_t ino);
+ int ll_getattr(inodeno_t ino, struct stat *st);
+ int ll_setattr(inodeno_t ino, struct stat *st, int mask);
+ int ll_opendir(inodeno_t ino, void **dirpp);
+ void ll_releasedir(void *dirp);
+ int ll_readlink(inodeno_t ino, const char **value);
+ int ll_mknod(inodeno_t ino, const char *name, mode_t mode, dev_t rdev, struct stat *attr);
+ int ll_mkdir(inodeno_t ino, const char *name, mode_t mode, struct stat *attr);
+ int ll_symlink(inodeno_t ino, const char *name, const char *value, struct stat *attr);
+ int ll_unlink(inodeno_t ino, const char *name);
+ int ll_rmdir(inodeno_t ino, const char *name);
+ int ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname);
+ int ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr);
+ int ll_open(inodeno_t ino, int flags, Fh **fh);
+ int ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh);
+ int ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl);
+ int ll_write(Fh *fh, off_t off, off_t len, const char *data);
+ int ll_release(Fh *fh);
+
+
// failure
void ms_handle_failure(Message*, const entity_inst_t& inst);
};
#include <sys/types.h>
#include <utime.h>
#include <math.h>
+#include <sys/statvfs.h>
#include "config.h"
#undef dout
syn_modes.push_back( SYNCLIENT_MODE_TRUNCATE );
syn_sargs.push_back(args[++i]);
syn_iargs.push_back(atoi(args[++i]));
+ } else if (strcmp(args[i],"importfind") == 0) {
+ syn_modes.push_back(SYNCLIENT_MODE_IMPORTFIND);
+ syn_sargs.push_back(args[++i]);
+ syn_sargs.push_back(args[++i]);
+ syn_iargs.push_back(atoi(args[++i]));
} else {
cerr << "unknown syn arg " << args[i] << endl;
assert(0);
utime_t start = g_clock.now();
if (time_to_stop()) break;
- play_trace(t, prefix);
+ play_trace(t, prefix, false);
if (time_to_stop()) break;
clean_dir(prefix);
{
int count = iargs.front(); iargs.pop_front();
if (run_me()) {
- client->mknod("test",0777);
+ client->mknod("test", 0777);
struct stat st;
for (int i=0; i<count; i++) {
client->lstat("test", &st);
client->truncate(file.c_str(), iarg1);
}
break;
+
+
+ case SYNCLIENT_MODE_IMPORTFIND:
+ {
+ string base = get_sarg(0);
+ string find = get_sarg(0);
+ int data = get_iarg();
+ if (run_me())
+ import_find(base.c_str(), find.c_str(), data);
+ }
+ break;
default:
assert(0);
}
-int SyntheticClient::play_trace(Trace& t, string& prefix)
+int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
{
dout(4) << "play trace" << endl;
t.start();
+ char buf[1024];
+
utime_t start = g_clock.now();
const char *p = prefix.c_str();
- map<int64_t, int64_t> open_files;
+ hash_map<int64_t, int64_t> open_files;
+ hash_map<int64_t, DIR*> open_dirs;
+
+ hash_map<int64_t, Fh*> ll_files;
+ hash_map<int64_t, void*> ll_dirs;
+ hash_map<uint64_t, int64_t> ll_inos;
+
+ ll_inos[1] = 1; // root inode is known.
while (!t.end()) {
if (time_to_stop()) break;
// op
- const char *op = t.get_string();
- dout(4) << "trace op " << op << endl;
+ const char *op = t.get_string(buf, 0);
+ dout(4) << (t.get_line()-1) << ": trace op " << op << endl;
+
+ // high level ops ---------------------
if (strcmp(op, "link") == 0) {
- const char *a = t.get_string(p);
- const char *b = t.get_string(p);
+ const char *a = t.get_string(buf, p);
+ const char *b = t.get_string(buf, p);
client->link(a,b);
} else if (strcmp(op, "unlink") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
client->unlink(a);
} else if (strcmp(op, "rename") == 0) {
- const char *a = t.get_string(p);
- const char *b = t.get_string(p);
+ const char *a = t.get_string(buf, p);
+ const char *b = t.get_string(buf, p);
client->rename(a,b);
} else if (strcmp(op, "mkdir") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
client->mkdir(a, b);
} else if (strcmp(op, "rmdir") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
client->rmdir(a);
} else if (strcmp(op, "symlink") == 0) {
- const char *a = t.get_string(p);
- const char *b = t.get_string(p);
+ const char *a = t.get_string(buf, p);
+ const char *b = t.get_string(buf, p);
client->symlink(a,b);
} else if (strcmp(op, "readlink") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
char buf[100];
client->readlink(a, buf, 100);
} else if (strcmp(op, "lstat") == 0) {
struct stat st;
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
client->lstat(a, &st);
} else if (strcmp(op, "chmod") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
client->chmod(a, b);
} else if (strcmp(op, "chown") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
int64_t c = t.get_int();
client->chown(a, b, c);
} else if (strcmp(op, "utime") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
int64_t c = t.get_int();
struct utimbuf u;
u.modtime = c;
client->utime(a, &u);
} else if (strcmp(op, "mknod") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
- client->mknod(a, b);
+ int64_t c = t.get_int();
+ client->mknod(a, b, c);
+ } else if (strcmp(op, "getdir") == 0) {
+ const char *a = t.get_string(buf, p);
+ list<string> contents;
+ client->getdir(a, contents);
} else if (strcmp(op, "getdir") == 0) {
- const char *a = t.get_string(p);
- map<string,inode_t> contents;
+ const char *a = t.get_string(buf, p);
+ list<string> contents;
client->getdir(a, contents);
+ } else if (strcmp(op, "opendir") == 0) {
+ const char *a = t.get_string(buf, p);
+ int64_t b = t.get_int();
+ DIR *dirp;
+ client->opendir(a, &dirp);
+ if (dirp) open_dirs[b] = dirp;
+ } else if (strcmp(op, "closedir") == 0) {
+ int64_t a = t.get_int();
+ client->closedir(open_dirs[a]);
+ open_dirs.erase(a);
} else if (strcmp(op, "open") == 0) {
- const char *a = t.get_string(p);
+ const char *a = t.get_string(buf, p);
int64_t b = t.get_int();
- int64_t id = t.get_int();
- int64_t fh = client->open(a, b);
- open_files[id] = fh;
+ int64_t c = t.get_int();
+ int64_t d = t.get_int();
+ int64_t fd = client->open(a, b, c);
+ if (fd > 0) open_files[d] = fd;
} else if (strcmp(op, "close") == 0) {
int64_t id = t.get_int();
int64_t fh = open_files[id];
if (fh > 0) client->close(fh);
open_files.erase(id);
- } else if (strcmp(op, "truncate") == 0) {
- const char *a = t.get_string(p);
- int64_t b = t.get_int();
- client->truncate(a,b);
- } else if (strcmp(op, "read") == 0) {
- int64_t id = t.get_int();
- int64_t fh = open_files[id];
- int size = t.get_int();
- int off = t.get_int();
- char *buf = new char[size];
- client->read(fh, buf, size, off);
- delete[] buf;
} else if (strcmp(op, "lseek") == 0) {
- int64_t id = t.get_int();
- int64_t fh = open_files[id];
- int off = t.get_int();
- int whence = t.get_int();
- client->lseek(fh, off, whence);
+ int64_t f = t.get_int();
+ int fd = open_files[f];
+ int64_t off = t.get_int();
+ int64_t whence = t.get_int();
+ client->lseek(fd, off, whence);
+ } else if (strcmp(op, "read") == 0) {
+ int64_t f = t.get_int();
+ int64_t size = t.get_int();
+ int64_t off = t.get_int();
+ int64_t fd = open_files[f];
+ if (!metadata_only) {
+ char *b = new char[size];
+ client->read(fd, b, size, off);
+ delete[] b;
+ }
} else if (strcmp(op, "write") == 0) {
- int64_t id = t.get_int();
- int64_t fh = open_files[id];
- int size = t.get_int();
- int off = t.get_int();
- char *buf = new char[size];
- memset(buf, 1, size); // let's write 1's!
- client->write(fh, buf, size, off);
- delete[] buf;
+ int64_t f = t.get_int();
+ int64_t fd = open_files[f];
+ int64_t size = t.get_int();
+ int64_t off = t.get_int();
+ if (!metadata_only) {
+ char *b = new char[size];
+ memset(b, 1, size); // let's write 1's!
+ client->write(fd, b, size, off);
+ delete[] b;
+ }
+ } else if (strcmp(op, "truncate") == 0) {
+ const char *a = t.get_string(buf, p);
+ int64_t l = t.get_int();
+ client->truncate(a, l);
+ } else if (strcmp(op, "ftruncate") == 0) {
+ int64_t f = t.get_int();
+ int fd = open_files[f];
+ int64_t l = t.get_int();
+ client->ftruncate(fd, l);
} else if (strcmp(op, "fsync") == 0) {
+ int64_t f = t.get_int();
+ int64_t b = t.get_int();
+ int fd = open_files[f];
+ client->fsync(fd, b);
+ } else if (strcmp(op, "chdir") == 0) {
+ const char *a = t.get_string(buf, p);
+ client->chdir(a);
+ } else if (strcmp(op, "statfs") == 0) {
+ struct statvfs stbuf;
+ client->statfs("/", &stbuf);
+ }
+
+ // low level ops ---------------------
+ else if (strcmp(op, "ll_lookup") == 0) {
+ int64_t i = t.get_int();
+ const char *name = t.get_string(buf, p);
+ int64_t r = t.get_int();
+ struct stat attr;
+ if (client->ll_lookup(ll_inos[i], name, &attr) == 0)
+ ll_inos[r] = attr.st_ino;
+ } else if (strcmp(op, "ll_forget") == 0) {
+ int64_t i = t.get_int();
+ int64_t n = t.get_int();
+ if (client->ll_forget(ll_inos[i], n))
+ ll_inos.erase(i);
+ } else if (strcmp(op, "ll_getattr") == 0) {
+ int64_t i = t.get_int();
+ struct stat attr;
+ client->ll_getattr(ll_inos[i], &attr);
+ } else if (strcmp(op, "ll_setattr") == 0) {
+ int64_t i = t.get_int();
+ struct stat attr;
+ memset(&attr, 0, sizeof(attr));
+ attr.st_mode = t.get_int();
+ attr.st_uid = t.get_int();
+ attr.st_gid = t.get_int();
+ attr.st_size = t.get_int();
+ attr.st_mtime = t.get_int();
+ attr.st_atime = t.get_int();
+ int mask = t.get_int();
+ client->ll_setattr(ll_inos[i], &attr, mask);
+ } else if (strcmp(op, "ll_readlink") == 0) {
+ int64_t i = t.get_int();
+ const char *value;
+ client->ll_readlink(ll_inos[i], &value);
+ } else if (strcmp(op, "ll_mknod") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ int m = t.get_int();
+ int r = t.get_int();
+ int64_t ri = t.get_int();
+ struct stat attr;
+ if (client->ll_mknod(ll_inos[i], n, m, r, &attr) == 0)
+ ll_inos[ri] = attr.st_ino;
+ } else if (strcmp(op, "ll_mkdir") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ int m = t.get_int();
+ int64_t ri = t.get_int();
+ struct stat attr;
+ if (client->ll_mkdir(ll_inos[i], n, m, &attr) == 0)
+ ll_inos[ri] = attr.st_ino;
+ } else if (strcmp(op, "ll_symlink") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ const char *v = t.get_string(buf, p);
+ int64_t ri = t.get_int();
+ struct stat attr;
+ if (client->ll_symlink(i, n, v, &attr) == 0)
+ ll_inos[ri] = attr.st_ino;
+ } else if (strcmp(op, "ll_unlink") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ client->ll_unlink(ll_inos[i], n);
+ } else if (strcmp(op, "ll_rmdir") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ client->ll_rmdir(ll_inos[i], n);
+ } else if (strcmp(op, "ll_rename") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ int64_t ni = t.get_int();
+ const char *nn = t.get_string(buf, p);
+ client->ll_rename(ll_inos[i], n, ll_inos[ni], nn);
+ } else if (strcmp(op, "ll_link") == 0) {
+ int64_t i = t.get_int();
+ int64_t ni = t.get_int();
+ const char *nn = t.get_string(buf, p);
+ struct stat attr;
+ client->ll_link(ll_inos[i], ni, nn, &attr);
+ } else if (strcmp(op, "ll_opendir") == 0) {
+ int64_t i = t.get_int();
+ int64_t r = t.get_int();
+ void *dirp;
+ client->ll_opendir(ll_inos[i], &dirp);
+ ll_dirs[r] = dirp;
+ } else if (strcmp(op, "ll_releasedir") == 0) {
+ int64_t f = t.get_int();
+ void *dirp = ll_dirs[f];
+ client->ll_releasedir(dirp);
+ ll_dirs.erase(f);
+ } else if (strcmp(op, "ll_open") == 0) {
+ int64_t i = t.get_int();
+ int64_t f = t.get_int();
+ int64_t r = t.get_int();
+ Fh *fhp;
+ client->ll_open(ll_inos[i], f, &fhp);
+ ll_files[r] = fhp;
+ } else if (strcmp(op, "ll_create") == 0) {
+ int64_t i = t.get_int();
+ const char *n = t.get_string(buf, p);
+ int64_t m = t.get_int();
+ int64_t f = t.get_int();
+ int64_t r = t.get_int();
+ int64_t ri = t.get_int();
+ Fh *fhp;
+ struct stat attr;
+ if (client->ll_create(ll_inos[i], n, m, f, &attr, &fhp) == 0) {
+ ll_inos[ri] = attr.st_ino;
+ ll_files[r] = fhp;
+ }
+ } else if (strcmp(op, "ll_read") == 0) {
+ int64_t f = t.get_int();
+ int64_t off = t.get_int();
+ int64_t size = t.get_int();
+ Fh *fh = ll_files[f];
+ if (!metadata_only) {
+ bufferlist bl;
+ client->ll_read(fh, off, size, &bl);
+ }
+ } else if (strcmp(op, "ll_write") == 0) {
+ int64_t f = t.get_int();
+ int64_t off = t.get_int();
+ int64_t size = t.get_int();
+ Fh *fh = ll_files[f];
+ if (!metadata_only) {
+ bufferlist bl;
+ bufferptr bp(size);
+ bl.push_back(bp);
+ bp.zero();
+ client->ll_write(fh, off, size, bl.c_str());
+ }
+ } else if (strcmp(op, "ll_release") == 0) {
+ int64_t f = t.get_int();
+ Fh *fh = ll_files[f];
+ client->ll_release(fh);
+ ll_files.erase(f);
+ }
+
+ else {
+ cout << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << endl;
assert(0);
- } else
- assert(0);
+ }
}
// close open files
- for (map<int64_t, int64_t>::iterator fi = open_files.begin();
+ for (hash_map<int64_t, int64_t>::iterator fi = open_files.begin();
fi != open_files.end();
fi++) {
dout(1) << "leftover close " << fi->second << endl;
if (fi->second > 0) client->close(fi->second);
}
+ for (hash_map<int64_t, DIR*>::iterator fi = open_dirs.begin();
+ fi != open_dirs.end();
+ fi++) {
+ dout(1) << "leftover closedir " << fi->second << endl;
+ if (fi->second != 0) client->closedir(fi->second);
+ }
+ for (hash_map<int64_t,Fh*>::iterator fi = ll_files.begin();
+ fi != ll_files.end();
+ fi++) {
+ dout(1) << "leftover ll_release " << fi->second << endl;
+ if (fi->second > 0) client->ll_release(fi->second);
+ }
+ for (hash_map<int64_t,void*>::iterator fi = ll_dirs.begin();
+ fi != ll_dirs.end();
+ fi++) {
+ dout(1) << "leftover ll_releasedir " << fi->second << endl;
+ if (fi->second > 0) client->ll_releasedir(fi->second);
+ }
return 0;
}
+
int SyntheticClient::clean_dir(string& basedir)
{
// read dir
- map<string, inode_t> contents;
+ list<string> contents;
int r = client->getdir(basedir.c_str(), contents);
if (r < 0) {
dout(1) << "readdir on " << basedir << " returns " << r << endl;
return r;
}
- for (map<string, inode_t>::iterator it = contents.begin();
+ for (list<string>::iterator it = contents.begin();
it != contents.end();
it++) {
- if (it->first == ".") continue;
- if (it->first == "..") continue;
- string file = basedir + "/" + it->first;
+ if (*it == ".") continue;
+ if (*it == "..") continue;
+ string file = basedir + "/" + *it;
if (time_to_stop()) break;
dirq.pop_front();
// read dir
- map<string, inode_t> contents;
+ list<string> contents;
int r = client->getdir(dir.c_str(), contents);
if (r < 0) {
dout(1) << "readdir on " << dir << " returns " << r << endl;
continue;
}
- for (map<string, inode_t>::iterator it = contents.begin();
+ for (list<string>::iterator it = contents.begin();
it != contents.end();
it++) {
- if (it->first == ".") continue;
- if (it->first == "..") continue;
- string file = dir + "/" + it->first;
+ if (*it == "." ||
+ *it == "..")
+ continue;
+ string file = dir + "/" + *it;
struct stat st;
int r = client->lstat(file.c_str(), &st);
char d[500];
dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
- map<string,inode_t> contents;
+ list<string> contents;
utime_t s = g_clock.now();
int r = client->getdir(basedir, contents);
utime_t e = g_clock.now();
if (op == MDS_OP_READDIR) {
clear_dir();
- map<string, inode_t> c;
+ list<string> c;
r = client->getdir( cwd.c_str(), c );
- for (map<string, inode_t>::iterator it = c.begin();
+ for (list<string>::iterator it = c.begin();
it != c.end();
it++) {
- //dout(DBL) << " got " << it->first << endl;
- contents[it->first] = it->second;
- if (it->second.is_dir())
- subdirs.insert(it->first);
+ //dout(DBL) << " got " << *it << endl;
+ assert(0);
+ /*contents[*it] = it->second;
+ if (it->second &&
+ S_ISDIR(it->second->st_mode))
+ subdirs.insert(*it);
+ */
}
did_readdir = true;
int c = rand() % s;
char src[80];
sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c);
- int fd = client->open(src, O_RDONLY);
+ //int fd =
+ client->open(src, O_RDONLY);
}
return;
}
+
+
+/*
+ * import_find - recreate a file tree under `base` from a saved listing.
+ *
+ *  base - directory (created here with mode 0755) that receives the tree
+ *  find - path of a text file holding one `ls -dilsn` record per line
+ *         (see the command below)
+ *  data - only logged by this function; it does not change what is written
+ *
+ * Directories are created with mkdir, symlinks with symlink, and every
+ * other entry becomes a file whose length is set by writing a single byte
+ * at offset size-1, followed by chmod/chown/utime from the record.
+ */
+void SyntheticClient::import_find(const char *base, const char *find, bool data)
+{
+  dout(1) << "import_find " << base << " from " << find << " data=" << data << endl;
+
+  /* use this to gather the static trace:
+   *
+   *  find . -exec ls -dilsn --time-style=+%s \{\} \;
+   * or if it's wafl,
+   *  find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \;
+   *
+   */
+
+  client->mkdir(base, 0755);
+
+  ifstream f(find);
+  assert(f.is_open());
+
+  while (!f.eof()) {
+    uint64_t ino;
+    int dunno, nlink;
+    string modestring;
+    int uid, gid;
+    off_t size;
+    time_t mtime;
+    string filename;
+    f >> ino;
+    if (f.eof()) break;
+    f >> dunno;
+    f >> modestring;
+    f >> nlink;
+    f >> uid;
+    f >> gid;
+    f >> size;
+    f >> mtime;
+    f.seekg(1, ios::cur);   // skip the single separator before the name
+    getline(f, filename);
+
+    // remove leading ./ (length check keeps short names like "." in bounds)
+    if (filename.length() >= 2 && filename[0] == '.' && filename[1] == '/')
+      filename = filename.substr(2);
+
+    // parse the mode
+    assert(modestring.length() == 10);
+    mode_t mode = 0;
+    switch (modestring[0]) {
+    case 'd': mode |= INODE_MODE_DIR; break;
+    case 'l': mode |= INODE_MODE_SYMLINK; break;
+    default:
+    case '-': mode |= INODE_MODE_FILE; break;
+    }
+    if (modestring[1] == 'r') mode |= 0400;
+    if (modestring[2] == 'w') mode |= 0200;
+    if (modestring[3] == 'x') mode |= 0100;
+    if (modestring[4] == 'r') mode |= 040;
+    if (modestring[5] == 'w') mode |= 020;
+    if (modestring[6] == 'x') mode |= 010;
+    if (modestring[7] == 'r') mode |= 04;
+    if (modestring[8] == 'w') mode |= 02;
+    if (modestring[9] == 'x') mode |= 01;
+
+    dout(20) << " mode " << modestring << " to " << oct << mode << dec << endl;
+
+    if (S_ISLNK(mode)) {
+      // the name field looks like "link -> target"; split it
+      string::size_type pos = filename.find(" -> ");
+      assert(pos != string::npos && pos > 0);
+      string link = base;
+      link += "/";
+      link += filename.substr(0, pos);
+      const string dest = filename.substr(pos + 4);
+      string target;
+      if (!dest.empty() && dest[0] == '/') {
+        // absolute targets are re-rooted under base
+        target = base;
+        target += dest;
+      } else {
+        target = dest;
+      }
+      dout(10) << "symlink from '" << link << "' -> '" << target << "'" << endl;
+      client->symlink(target.c_str(), link.c_str());
+    } else {
+      // named `path` so it does not shadow the input stream `f`
+      string path = base;
+      path += "/";
+      path += filename;
+      if (S_ISDIR(mode)) {
+        client->mkdir(path.c_str(), mode);
+      } else {
+        int fd = client->open(path.c_str(), O_WRONLY|O_CREAT);
+        assert(fd > 0);
+        // one byte at the last offset gives the file its recorded length;
+        // an empty file needs no write (size-1 would be offset -1)
+        if (size > 0)
+          client->write(fd, " ", 1, size-1);
+        client->close(fd);
+
+        client->chmod(path.c_str(), mode & 0777);
+        client->chown(path.c_str(), uid, gid);
+
+        struct utimbuf ut;
+        ut.modtime = mtime;
+        ut.actime = mtime;
+        client->utime(path.c_str(), &ut);
+      }
+    }
+  }
+}
+
#define SYNCLIENT_MODE_FOO 100
#define SYNCLIENT_MODE_THRASHLINKS 101
+#define SYNCLIENT_MODE_IMPORTFIND 300
+
void parse_syn_options(vector<char*>& args);
filepath cwd;
- map<string, inode_t> contents;
+ map<string, struct stat*> contents;
set<string> subdirs;
bool did_readdir;
set<int> open_files;
r += cwd.last_dentry().c_str()[0]; // slightly permuted
r %= contents.size();
- map<string,inode_t>::iterator it = contents.begin();
+ map<string,struct stat*>::iterator it = contents.begin();
while (r--) it++;
n2 = cwd;
int exclude;
string get_sarg(int seq);
+  // Pop the next integer argument off the front of iargs and return it.
+  // NOTE(review): front() is called unconditionally, so callers presumably
+  // guarantee iargs is non-empty -- confirm.
+  int get_iarg() {
+    const int next = iargs.front();
+    iargs.pop_front();
+    return next;
+  }
bool time_to_stop() {
utime_t now = g_clock.now();
int clean_dir(string& basedir);
- int play_trace(Trace& t, string& prefix);
+ int play_trace(Trace& t, string& prefix, bool metadata_only=false);
void make_dir_mess(const char *basedir, int n);
void foo();
int thrash_links(const char *basedir, int dirs, int files, int depth, int n);
+ void import_find(const char *basedir, const char *find, bool writedata);
+
};
#endif
class Trace {
class TokenList *tl;
-
+ int _line;
+
public:
Trace(const char* filename);
~Trace();
-
+
+ int get_line() { return _line; }
list<const char*>& get_list();
list<const char*>::iterator _cur;
void start() {
_cur = get_list().begin();
_end = get_list().end();
- ns = 0;
+ _line = 1;
}
- char strings[10][200];
- int ns;
- const char *get_string(const char *prefix = 0) {
+ const char *get_string(char *buf, const char *prefix) {
assert(_cur != _end);
const char *s = *_cur;
- _cur++;
+ _cur++; _line++;
if (prefix) {
if (strstr(s, "/prefix") == s ||
strstr(s, "/prefix") == s+1) {
- strcpy(strings[ns], prefix);
- strcpy(strings[ns] + strlen(prefix),
+ strcpy(buf, prefix);
+ strcpy(buf + strlen(prefix),
s + strlen("/prefix"));
- s = (const char*)strings[ns];
- ns++;
- if (ns == 10) ns = 0;
+ s = (const char*)buf;
}
}
return s;
}
__int64_t get_int() {
- return atoll(get_string());
+ char buf[20];
+ return atoll(get_string(buf, 0));
}
bool end() {
return _cur == _end;
#define _XOPEN_SOURCE 500
#endif
-#define FUSE_USE_VERSION 25
+#define FUSE_USE_VERSION 26
#include <fuse.h>
#include <stdio.h>
#include "config.h"
-// stl
-#include <map>
-using namespace std;
-
-
// globals
-Client *client; // the ceph client
+static Client *client; // the ceph client
return 0;
}
-
-static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler)
-{
- map<string, inode_t> contents;
-
- int res = client->getdir(path, contents);
- if (res < 0) return res;
-
- // return contents to fuse via callback
- for (map<string, inode_t>::iterator it = contents.begin();
- it != contents.end();
- it++) {
- // (immutable) inode contents too.
- res = filler(h, // fuse's handle
- it->first.c_str(), // dentry as char*
- it->second.mode & INODE_TYPE_MASK, // mask type bits from mode
- it->second.ino); // ino.. 64->32 bit issue here? FIXME
- if (res != 0) break; // fuse has had enough
- }
- return res;
-}
-
static int ceph_mknod(const char *path, mode_t mode, dev_t rdev)
{
return client->mknod(path, mode);
}
+// ------------------
+// file i/o
+
static int ceph_open(const char *path, struct fuse_file_info *fi)
{
int res;
static int ceph_read(const char *path, char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
- fh_t fh = fi->fh;
- return client->read(fh, buf, size, offset);
+ int fd = fi->fh;
+ return client->read(fd, buf, size, offset);
}
static int ceph_write(const char *path, const char *buf, size_t size,
off_t offset, struct fuse_file_info *fi)
{
- fh_t fh = fi->fh;
- return client->write(fh, buf, size, offset);
+ int fd = fi->fh;
+ return client->write(fd, buf, size, offset);
}
static int ceph_flush(const char *path, struct fuse_file_info *fi)
{
-//fh_t fh = fi->fh;
+ //int fh = fi->fh;
//return client->flush(fh);
return 0;
}
-
static int ceph_statfs(const char *path, struct statvfs *stbuf)
{
return client->statfs(path, stbuf);
}
-
-
static int ceph_release(const char *path, struct fuse_file_info *fi)
{
- fh_t fh = fi->fh;
- int r = client->close(fh); // close the file
+ int fd = fi->fh;
+ int r = client->close(fd); // close the file
return r;
}
static int ceph_fsync(const char *path, int isdatasync,
struct fuse_file_info *fi)
{
- fh_t fh = fi->fh;
- return client->fsync(fh, isdatasync ? true:false);
+ int fd = fi->fh;
+ return client->fsync(fd, isdatasync ? true:false);
}
+// ---------------------
+// directory i/o
+
+// Open the directory at `path`; on success the handle is stored in fi->fh
+// and 0 is returned, otherwise the negative result from opendir is passed up.
+static int ceph_opendir(const char *path, struct fuse_file_info *fi)
+{
+  DIR *handle;
+  int rc = client->opendir(path, &handle);
+  if (rc >= 0) {
+    fi->fh = (uint64_t)(void*)handle;
+    return 0;
+  }
+  return rc;
+}
+
+// Feed directory entries to fuse starting at offset `off`, using the handle
+// saved in fi->fh.  Stops when the client reports no further entry or when
+// the filler returns non-zero.  Always returns 0.
+static int ceph_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t off, fuse_file_info *fi)
+{
+  DIR *dir = (DIR*)fi->fh;
+  client->seekdir(dir, off);
+
+  // stat info is only handed to fuse when both ino and type are valid
+  const int needed = STAT_MASK_INO | STAT_MASK_TYPE;
+
+  struct dirent entry;
+  struct stat attrs;
+  int have = 0;
+  int filled = 0;
+  while (!filled) {
+    if (client->readdirplus_r(dir, &entry, &attrs, &have) != 0)
+      break;
+    struct stat *stp = 0;
+    if ((have & needed) == needed)
+      stp = &attrs;
+    filled = filler(buf, entry.d_name, stp, client->telldir(dir));
+  }
+  return 0;
+}
+
+// Close the directory handle saved in fi->fh and return closedir's result.
+static int ceph_releasedir(const char *path, struct fuse_file_info *fi)
+{
+  return client->closedir((DIR*)fi->fh);
+}
+
+
+
+
+
static struct fuse_operations ceph_oper = {
getattr: ceph_getattr,
readlink: ceph_readlink,
- getdir: ceph_getdir,
+ getdir: 0,
mknod: ceph_mknod,
mkdir: ceph_mkdir,
unlink: ceph_unlink,
statfs: ceph_statfs,
flush: ceph_flush,
release: ceph_release,
- fsync: ceph_fsync
+ fsync: ceph_fsync,
+ setxattr: 0,
+ getxattr: 0,
+ listxattr: 0,
+ removexattr: 0,
+ opendir: ceph_opendir,
+ readdir: ceph_readdir,
+ releasedir: ceph_releasedir
};
// go fuse go
cout << "ok, calling fuse_main" << endl;
- int r = fuse_main(newargc, newargv, &ceph_oper);
+ int r = fuse_main(newargc, newargv, &ceph_oper, 0);
return r;
}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#define FUSE_USE_VERSION 26
+
+#include <fuse/fuse_lowlevel.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+
+// ceph
+#include "include/types.h"
+#include "Client.h"
+#include "config.h"
+
+static Client *client;
+
+
+// Look up `name` under inode `parent`.  A non-negative result replies with
+// the entry (ino taken from the returned attributes); a negative one
+// replies ENOENT.
+static void ceph_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  int rc = client->ll_lookup(parent, name, &entry.attr);
+  if (rc < 0) {
+    fuse_reply_err(req, ENOENT);
+    return;
+  }
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_entry(req, &entry);
+}
+
+// Forward a forget of `nlookup` references on `ino` to the client; the
+// client's return value is ignored and the request is answered with
+// fuse_reply_none.
+static void ceph_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup)
+{
+ client->ll_forget(ino, nlookup);
+ fuse_reply_none(req);
+}
+
+// Fetch the attributes of `ino`; replies with them (timeout argument 0) when
+// the client returns 0, otherwise replies ENOENT.  `fi` is unused.
+static void ceph_ll_getattr(fuse_req_t req, fuse_ino_t ino,
+                            struct fuse_file_info *fi)
+{
+  (void) fi;
+
+  struct stat attr;
+  if (client->ll_getattr(ino, &attr) != 0) {
+    fuse_reply_err(req, ENOENT);
+    return;
+  }
+  fuse_reply_attr(req, &attr, 0);
+}
+
+// Apply the fields of `attr` selected by `to_set` to `ino`.  On success the
+// same attr struct is sent back; on failure the negated client result is
+// sent as the error.
+static void ceph_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
+                            int to_set, struct fuse_file_info *fi)
+{
+  int rc = client->ll_setattr(ino, attr, to_set);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fuse_reply_attr(req, attr, 0);
+}
+
+// Open directory inode `ino`; a non-negative result stores the handle in
+// fi->fh and replies open, a negative one is negated and sent as the error.
+static void ceph_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+  void *handle;
+  int rc = client->ll_opendir(ino, &handle);
+  if (rc < 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fi->fh = (long)handle;
+  fuse_reply_open(req, fi);
+}
+
+// Read the target of symlink `ino` and reply with it, or reply with the
+// negated client result when it is non-zero.
+static void ceph_ll_readlink(fuse_req_t req, fuse_ino_t ino)
+{
+  const char *target;
+  int rc = client->ll_readlink(ino, &target);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fuse_reply_readlink(req, target);
+}
+
+// Create a node `name` under `parent`; replies with the new entry on
+// success, or with the negated client result on failure.
+static void ceph_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
+                          mode_t mode, dev_t rdev)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  int rc = client->ll_mknod(parent, name, mode, rdev, &entry.attr);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_entry(req, &entry);
+}
+
+// Create directory `name` under `parent`; replies with the new entry on
+// success, or with the negated client result on failure.
+static void ceph_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
+                          mode_t mode)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  int rc = client->ll_mkdir(parent, name, mode, &entry.attr);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_entry(req, &entry);
+}
+
+// Unlink `name` from `parent` and reply with the negated client result.
+static void ceph_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+  fuse_reply_err(req, -client->ll_unlink(parent, name));
+}
+
+// Remove directory `name` from `parent` and reply with the negated client result.
+static void ceph_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+  fuse_reply_err(req, -client->ll_rmdir(parent, name));
+}
+
+// Create symlink `name` under `parent` pointing at `existing`.  Note the
+// client takes (parent, name, existing), a different order from this
+// callback's arguments.
+static void ceph_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  int rc = client->ll_symlink(parent, name, existing, &entry.attr);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_entry(req, &entry);
+}
+
+// Rename parent/name to newparent/newname and reply with the negated
+// client result.
+static void ceph_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
+                           fuse_ino_t newparent, const char *newname)
+{
+  fuse_reply_err(req, -client->ll_rename(parent, name, newparent, newname));
+}
+
+// Link inode `ino` as `newname` under `newparent`; replies with the entry
+// on success, or with the negated client result on failure.
+static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
+                         const char *newname)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  int rc = client->ll_link(ino, newparent, newname, &entry.attr);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_entry(req, &entry);
+}
+
+// Open inode `ino` with fi->flags; on success the Fh pointer is stored in
+// fi->fh and the open is acknowledged, otherwise the negated client result
+// is sent as the error.
+static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+  Fh *handle;
+  int rc = client->ll_open(ino, fi->flags, &handle);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fi->fh = (long)handle;
+  fuse_reply_open(req, fi);
+}
+
+// Read `size` bytes at `off` through the Fh saved in fi->fh.  A non-negative
+// result replies with the bufferlist contents; a negative one is negated
+// and sent as the error.
+static void ceph_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+                         struct fuse_file_info *fi)
+{
+  Fh *handle = (Fh*)fi->fh;
+  bufferlist data;
+  int rc = client->ll_read(handle, off, size, &data);
+  if (rc < 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fuse_reply_buf(req, data.c_str(), data.length());
+}
+
+// Write `size` bytes from `buf` at `off` through the Fh saved in fi->fh.
+// A non-negative result is passed to fuse_reply_write; a negative one is
+// negated and sent as the error.
+static void ceph_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf,
+                          size_t size, off_t off, struct fuse_file_info *fi)
+{
+  Fh *handle = (Fh*)fi->fh;
+  int rc = client->ll_write(handle, off, size, buf);
+  if (rc < 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fuse_reply_write(req, rc);
+}
+
+// Flush handler: does no work and always answers the request with error
+// code 0.  `ino` and `fi` are unused.
+static void ceph_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+ // NOOP
+ fuse_reply_err(req, 0);
+}
+
+// Release the Fh saved in fi->fh and reply with the negated client result.
+static void ceph_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+  fuse_reply_err(req, -client->ll_release((Fh*)fi->fh));
+}
+
+// fsync handler.  Nothing is synced here yet, but the request must still be
+// answered: a lowlevel fuse request that never receives a reply leaves the
+// calling process blocked.  ENOSYS reports the operation as unimplemented.
+static void ceph_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
+                          struct fuse_file_info *fi)
+{
+  (void) ino;
+  (void) datasync;
+  (void) fi;
+  fuse_reply_err(req, ENOSYS);
+}
+
+// Fill a reply buffer of at most `size` bytes with directory entries,
+// starting at offset `off`, using the DIR handle saved in fi->fh.  Stops
+// when readdir_r returns a negative value or the next entry does not fit,
+// then replies with whatever was packed.
+static void ceph_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+                            off_t off, struct fuse_file_info *fi)
+{
+  (void) fi;
+
+  // reply buffer and number of bytes used so far
+  size_t used = 0;
+  char *reply = new char[size];
+  if (!reply) {
+    fuse_reply_err(req, ENOMEM);
+    return;
+  }
+
+  DIR *dir = (DIR*)fi->fh;
+  client->seekdir(dir, off);
+
+  struct dirent entry;
+  struct stat attr;
+  memset(&attr, 0, sizeof(attr));
+
+  for (;;) {
+    if (client->readdir_r(dir, &entry) < 0)
+      break;
+    attr.st_ino = entry.d_ino;
+    attr.st_mode = DT_TO_MODE(entry.d_type);
+
+    // offset recorded with this entry
+    off_t next_off = client->telldir(dir);
+    size_t need = fuse_add_direntry(req, reply + used, size - used,
+                                    entry.d_name, &attr, next_off);
+
+    cout << "ceph_ll_readdir added " << entry.d_name << " at " << used << " len " << need
+         << " (buffer size is " << size << ")"
+         << " .. off = " << next_off
+         << endl;
+
+    if (need > size - used)
+      break;  // didn't fit, done for now.
+    used += need;
+  }
+
+  fuse_reply_buf(req, reply, used);
+  delete[] reply;
+}
+
+// Release the directory handle saved in fi->fh; the client's result is
+// ignored and the request is always answered with error code 0.
+static void ceph_ll_releasedir(fuse_req_t req, fuse_ino_t ino,
+                               struct fuse_file_info *fi)
+{
+  client->ll_releasedir((DIR*)fi->fh);
+  fuse_reply_err(req, 0);
+}
+
+// Create and open `name` under `parent` with fi->flags.  On success the Fh
+// is stored in fi->fh and the entry plus file info are sent back; on
+// failure the negated client result is sent as the error.
+static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
+                           mode_t mode, struct fuse_file_info *fi)
+{
+  struct fuse_entry_param entry;
+  memset(&entry, 0, sizeof(entry));
+
+  Fh *handle;
+  int rc = client->ll_create(parent, name, mode, fi->flags, &entry.attr, &handle);
+  if (rc != 0) {
+    fuse_reply_err(req, -rc);
+    return;
+  }
+  fi->fh = (long)handle;
+  entry.ino = entry.attr.st_ino;
+  fuse_reply_create(req, &entry, fi);
+}
+
+// Lowlevel fuse operation table passed to fuse_lowlevel_new().  Entries set
+// to 0 are left unhandled by this file.  Note that `create` is 0 even though
+// ceph_ll_create is defined above (the handler is commented out here).
+static struct fuse_lowlevel_ops ceph_ll_oper = {
+ init: 0,
+ destroy: 0,
+ lookup: ceph_ll_lookup,
+ forget: ceph_ll_forget,
+ getattr: ceph_ll_getattr,
+ setattr: ceph_ll_setattr,
+ readlink: ceph_ll_readlink,
+ mknod: ceph_ll_mknod,
+ mkdir: ceph_ll_mkdir,
+ unlink: ceph_ll_unlink,
+ rmdir: ceph_ll_rmdir,
+ symlink: ceph_ll_symlink,
+ rename: ceph_ll_rename,
+ link: ceph_ll_link,
+ open: ceph_ll_open,
+ read: ceph_ll_read,
+ write: ceph_ll_write,
+ flush: ceph_ll_flush,
+ release: ceph_ll_release,
+ fsync: ceph_ll_fsync,
+ opendir: ceph_ll_opendir,
+ readdir: ceph_ll_readdir,
+ releasedir: ceph_ll_releasedir,
+ fsyncdir: 0,
+ statfs: 0,
+ setxattr: 0,
+ getxattr: 0,
+ listxattr: 0,
+ removexattr: 0,
+ access: 0,
+ create: 0, //ceph_ll_create,
+ getlk: 0,
+ setlk: 0,
+ bmap: 0
+};
+
+// Mount the filesystem through the lowlevel fuse API and run the session
+// loop until it exits.
+//
+//  c          - client that all ceph_ll_* callbacks will use
+//  argc/argv  - caller's arguments; argv[0] is kept, "-f -o allow_other" is
+//               inserted after it, and the remaining arguments follow
+//
+// Returns 0 when the session loop returned 0, and 1 on any failure
+// (argument parsing, mount, session setup, or allocation).
+int ceph_fuse_ll_main(Client *c, int argc, char *argv[])
+{
+  cout << "ceph_fuse_ll_main starting fuse" << endl;
+
+  client = c;
+
+  // writable storage for the fixed options (argv entries are char*)
+  static char opt_foreground[] = "-f";
+  static char opt_o[] = "-o";
+  static char opt_allow_other[] = "allow_other";
+
+  // set up fuse argc/argv
+  int newargc = 0;
+  char **newargv = (char **) malloc((argc + 10) * sizeof(char *));
+  if (!newargv) {
+    cout << "ceph_fuse_ll_main done, err=" << -1 << endl;
+    return 1;
+  }
+  newargv[newargc++] = argv[0];
+  newargv[newargc++] = opt_foreground;  // stay in foreground
+
+  newargv[newargc++] = opt_o;
+  newargv[newargc++] = opt_allow_other;
+
+  for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr];
+
+  // go go gadget fuse
+  struct fuse_args args = FUSE_ARGS_INIT(newargc, newargv);
+  struct fuse_chan *ch;
+  char *mountpoint = NULL;  // filled in by fuse_parse_cmdline; freed below
+  int err = -1;
+
+  if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) != -1 &&
+      (ch = fuse_mount(mountpoint, &args)) != NULL) {
+    struct fuse_session *se;
+
+    // init fuse
+    se = fuse_lowlevel_new(&args, &ceph_ll_oper, sizeof(ceph_ll_oper),
+                           NULL);
+    if (se != NULL) {
+      if (fuse_set_signal_handlers(se) != -1) {
+        fuse_session_add_chan(se, ch);
+        err = fuse_session_loop(se);
+        fuse_remove_signal_handlers(se);
+        fuse_session_remove_chan(ch);
+      }
+      fuse_session_destroy(se);
+    }
+    fuse_unmount(mountpoint, ch);
+  }
+  fuse_opt_free_args(&args);
+
+  // fuse_opt_free_args only frees argv arrays fuse allocated itself, so the
+  // array built above and the parsed mountpoint string are released here
+  free(mountpoint);
+  free(newargv);
+
+  cout << "ceph_fuse_ll_main done, err=" << err << endl;
+  return err ? 1 : 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __CEPH_FUSE_LL_H
+#define __CEPH_FUSE_LL_H
+
+// forward declaration so this header can be included on its own
+class Client;
+
+// Run the lowlevel fuse main loop on behalf of client `c`; argc/argv carry
+// the command-line arguments to forward to fuse.
+int ceph_fuse_ll_main(Client *c, int argc, char *argv[]);
+
+#endif
*
*/
-
-
#ifndef __DECAYCOUNTER_H
#define __DECAYCOUNTER_H
#include "config.h"
+/**
+ *
+ * TODO: normalize value based on some function of half_life,
+ * so that it can be interpreted as an approximation of a
+ * moving average of N seconds. currently, changing half-life
+ * skews the scale of the value, even at steady state.
+ *
+ */
+
class DecayCounter {
protected:
- double val; // value
-
- double half_life; // in seconds
- double k; // k = ln(.5)/half_life
-
+public:
+ double half_life;
+ double k; // k = ln(.5)/half_life
+ double val; // value
+ double delta; // delta since last decay
+ double vel; // recent velocity
utime_t last_decay; // time of last decay
public:
- DecayCounter() : val(0) {
+ DecayCounter() : val(0), delta(0), vel(0) {
set_halflife( g_conf.mds_decay_halflife );
reset();
}
- /*
- DecayCounter(double hl) : val(0) {
- set_halflife(hl);
+ DecayCounter(double hl) : val(0), delta(0), vel(0) {
+ set_halflife( hl );
reset();
}
- */
+
+ /**
+ * reading
+ */
+
+ double get() {
+ return get(g_clock.now());
+ }
+
+ double get(utime_t now) {
+ decay(now);
+ return val;
+ }
+
+ double get_last() {
+ return val;
+ }
+ double get_last_vel() {
+ return vel;
+ }
+
+ utime_t get_last_decay() {
+ return last_decay;
+ }
+
+ /**
+ * adjusting
+ */
+
+ void hit(utime_t now, double v = 1.0) {
+ decay(now);
+ delta += v;
+ }
+
void adjust(double a) {
- decay();
val += a;
}
- void adjust_down(const DecayCounter& other) {
- // assume other has same time stamp as us...
- val -= other.val;
+ void adjust(utime_t now, double a) {
+ decay(now);
+ val += a;
}
+ /**
+ * decay etc.
+ */
+
void set_halflife(double hl) {
half_life = hl;
k = log(.5) / hl;
}
- void take(DecayCounter& other) {
- *this = other;
- other.reset();
- }
-
void reset() {
- last_decay.sec_ref() = 0;
- last_decay.usec_ref() = 0;
- val = 0;
+ reset(g_clock.now());
+ }
+  // Restart the counter as of `now`: last_decay becomes the caller-supplied
+  // time (the argument was previously ignored in favour of g_clock.now()),
+  // and val and the pending delta are cleared.  vel is left untouched.
+  void reset(utime_t now) {
+    last_decay = now;
+    val = delta = 0;
}
- void decay() {
- utime_t el = g_clock.recent_now();
+ void decay(utime_t now) {
+ utime_t el = now;
el -= last_decay;
+
if (el.sec() >= 1) {
- val = val * exp((double)el * k);
- if (val < .01) val = 0;
- last_decay = g_clock.recent_now();
+ // calculate new value
+ double newval = (val+delta) * exp((double)el * k);
+ if (newval < .01) newval = 0.0;
+
+ // calculate velocity approx
+ vel += (newval - val) * (double)el;
+ vel *= exp((double)el * k);
+
+ val = newval;
+ delta = 0;
+ last_decay = now;
}
}
-
- double get() {
- decay();
- return val;
- }
-
- double hit(double v = 1.0) {
- decay();
- val += v;
- return val;
- }
-
};
if (g_conf.use_abspaths) {
char *cwd = get_current_dir_name();
filename = cwd;
- delete cwd;
+ free(cwd);
filename += "/";
}
//cout << "opening log file " << filename << endl;
}
- utime_t fromstart = g_clock.now();
+ utime_t fromstart = g_clock.recent_now();
if (fromstart < start) {
cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl;
assert(0);
debug_mds: 1,
debug_mds_balancer: 1,
debug_mds_log: 1,
+ debug_mds_migrator: 1,
debug_buffer: 0,
debug_filer: 0,
debug_objecter: 0,
+ debug_journaler: 0,
debug_objectcacher: 0,
debug_client: 0,
debug_osd: 0,
mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted
mon_stop_on_last_unmount: false,
mon_stop_with_last_mds: false,
+ mon_allow_mds_bully: true, // allow a booting mds to (forcibly) claim an mds #
// --- client ---
- client_cache_size: 300,
+ client_cache_size: 1000,
client_cache_mid: .5,
client_cache_stat_ttl: 0, // seconds until cached stat results become invalid
client_cache_readdir_ttl: 1, // 1 second only
client_trace: 0,
fuse_direct_io: 0,
+ fuse_ll: true,
// --- objecter ---
- objecter_buffer_uncommitted: true,
+ objecter_buffer_uncommitted: true, // this must be true for proper failure handling
// --- journaler ---
journaler_allow_split_entries: true,
journaler_safe: false, // wait for COMMIT on journal writes
journaler_write_head_interval: 15,
+ journaler_cache: false, // cache writes for later readback
// --- mds ---
mds_cache_size: MDS_CACHE_SIZE,
mds_cache_mid: .7,
- mds_decay_halflife: 30,
+ mds_decay_halflife: 10,
mds_beacon_interval: 5, //30.0,
- mds_beacon_grace: 15, //60*60.0,
+ mds_beacon_grace: 30, //60*60.0,
mds_log: true,
mds_log_max_len: MDS_CACHE_SIZE / 3,
mds_log_subtree_map_interval: 128*1024, // frequency (in bytes) of EImportMap in log
mds_log_eopen_size: 100, // # open inodes per log entry
+ mds_bal_sample_interval: 5.0, // every 5 seconds
mds_bal_replicate_threshold: 2000,
mds_bal_unreplicate_threshold: 0,//500,
- mds_bal_hash_rd: 10000,
- mds_bal_unhash_rd: 1000,
- mds_bal_hash_wr: 10000,
- mds_bal_unhash_wr: 1000,
+ mds_bal_split_size: 1000,
+ mds_bal_split_rd: 10000,
+ mds_bal_split_wr: 10000,
+ mds_bal_merge_size: 50,
+ mds_bal_merge_rd: 1000,
+ mds_bal_merge_wr: 1000,
mds_bal_interval: 30, // seconds
- mds_bal_hash_interval: 5, // seconds
+ mds_bal_fragment_interval: 5, // seconds
mds_bal_idle_threshold: .1,
mds_bal_max: -1,
mds_bal_max_until: -1,
mds_bal_mode: 0,
+ mds_bal_min_rebalance: .2, // must be this much above average before we export anything
mds_bal_min_start: .2, // if we need less than this, we don't do anything
mds_bal_need_min: .8, // take within this range of what we need
mds_bal_need_max: 1.2,
mds_local_osd: false,
mds_thrash_exports: 0,
+ mds_thrash_fragments: 0,
mds_dump_cache_on_map: false,
mds_dump_cache_after_rejoin: true,
osd_mkfs: false,
osd_age: .8,
osd_age_time: 0,
- osd_heartbeat_interval: 5, // shut up while i'm debugging
+ osd_heartbeat_interval: 15, // shut up while i'm debugging
osd_replay_window: 5,
osd_max_pull: 2,
osd_pad_pg_log: false,
// --- fakestore ---
- fakestore_fake_sync: 2, // 2 seconds
+ fakestore_fake_sync: .5, // seconds
fakestore_fsync: false,//true,
fakestore_writesync: false,
fakestore_syncthreads: 4,
g_conf.debug_mds_log = atoi(args[++i]);
else
g_debug_after_conf.debug_mds_log = atoi(args[++i]);
+ else if (strcmp(args[i], "--debug_mds_migrator") == 0)
+ if (!g_conf.debug_after)
+ g_conf.debug_mds_migrator = atoi(args[++i]);
+ else
+ g_debug_after_conf.debug_mds_migrator = atoi(args[++i]);
else if (strcmp(args[i], "--debug_buffer") == 0)
if (!g_conf.debug_after)
g_conf.debug_buffer = atoi(args[++i]);
g_conf.debug_objecter = atoi(args[++i]);
else
g_debug_after_conf.debug_objecter = atoi(args[++i]);
+ else if (strcmp(args[i], "--debug_journaler") == 0)
+ if (!g_conf.debug_after)
+ g_conf.debug_journaler = atoi(args[++i]);
+ else
+ g_debug_after_conf.debug_journaler = atoi(args[++i]);
else if (strcmp(args[i], "--debug_objectcacher") == 0)
if (!g_conf.debug_after)
g_conf.debug_objectcacher = atoi(args[++i]);
else if (strcmp(args[i], "--journaler_safe") == 0)
g_conf.journaler_safe = atoi(args[++i]);
+ else if (strcmp(args[i], "--journaler_cache") == 0)
+ g_conf.journaler_cache = atoi(args[++i]);
else if (strcmp(args[i], "--mds_cache_size") == 0)
g_conf.mds_cache_size = atoi(args[++i]);
else if (strcmp(args[i], "--mds_bal_max_until") == 0)
g_conf.mds_bal_max_until = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_bal_hash_rd") == 0)
- g_conf.mds_bal_hash_rd = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_bal_hash_wr") == 0)
- g_conf.mds_bal_hash_wr = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_bal_unhash_rd") == 0)
- g_conf.mds_bal_unhash_rd = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_bal_unhash_wr") == 0)
- g_conf.mds_bal_unhash_wr = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_split_size") == 0)
+ g_conf.mds_bal_split_size = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_split_rd") == 0)
+ g_conf.mds_bal_split_rd = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_split_wr") == 0)
+ g_conf.mds_bal_split_wr = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_merge_size") == 0)
+ g_conf.mds_bal_merge_size = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_merge_rd") == 0)
+ g_conf.mds_bal_merge_rd = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_merge_wr") == 0)
+ g_conf.mds_bal_merge_wr = atoi(args[++i]);
else if (strcmp(args[i], "--mds_bal_mode") == 0)
g_conf.mds_bal_mode = atoi(args[++i]);
g_conf.mds_local_osd = atoi(args[++i]);
else if (strcmp(args[i], "--mds_thrash_exports") == 0)
g_conf.mds_thrash_exports = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_thrash_fragments") == 0)
+ g_conf.mds_thrash_fragments = atoi(args[++i]);
else if (strcmp(args[i], "--mds_dump_cache_on_map") == 0)
g_conf.mds_dump_cache_on_map = true;
else if (strcmp(args[i], "--client_cache_readdir_ttl") == 0)
g_conf.client_cache_readdir_ttl = atoi(args[++i]);
else if (strcmp(args[i], "--client_trace") == 0)
- g_conf.client_trace = atoi(args[++i]);
+ g_conf.client_trace = args[++i];
+
else if (strcmp(args[i], "--fuse_direct_io") == 0)
g_conf.fuse_direct_io = atoi(args[++i]);
+ else if (strcmp(args[i], "--fuse_ll") == 0)
+ g_conf.fuse_ll = atoi(args[++i]);
else if (strcmp(args[i], "--mon_osd_down_out_interval") == 0)
g_conf.mon_osd_down_out_interval = atoi(args[++i]);
int debug_mds;
int debug_mds_balancer;
int debug_mds_log;
+ int debug_mds_migrator;
int debug_buffer;
int debug_filer;
int debug_objecter;
+ int debug_journaler;
int debug_objectcacher;
int debug_client;
int debug_osd;
float mon_accept_timeout;
bool mon_stop_on_last_unmount;
bool mon_stop_with_last_mds;
+ bool mon_allow_mds_bully;
// client
int client_cache_size;
size_t client_bcache_align;
*/
- int client_trace;
+ char *client_trace;
int fuse_direct_io;
+ bool fuse_ll;
// objecter
bool objecter_buffer_uncommitted;
bool journaler_allow_split_entries;
bool journaler_safe;
int journaler_write_head_interval;
+ bool journaler_cache;
// mds
int mds_cache_size;
off_t mds_log_subtree_map_interval;
int mds_log_eopen_size;
+ float mds_bal_sample_interval;
float mds_bal_replicate_threshold;
float mds_bal_unreplicate_threshold;
- float mds_bal_hash_rd;
- float mds_bal_unhash_rd;
- float mds_bal_hash_wr;
- float mds_bal_unhash_wr;
+ int mds_bal_split_size;
+ float mds_bal_split_rd;
+ float mds_bal_split_wr;
+ int mds_bal_merge_size;
+ float mds_bal_merge_rd;
+ float mds_bal_merge_wr;
int mds_bal_interval;
- int mds_bal_hash_interval;
+ int mds_bal_fragment_interval;
float mds_bal_idle_threshold;
int mds_bal_max;
int mds_bal_max_until;
int mds_bal_mode;
+ float mds_bal_min_rebalance;
float mds_bal_min_start;
float mds_bal_need_min;
float mds_bal_need_max;
bool mds_local_osd;
int mds_thrash_exports;
+ int mds_thrash_fragments;
bool mds_dump_cache_on_map;
bool mds_dump_cache_after_rejoin;
int osd_max_pull;
bool osd_pad_pg_log;
- int fakestore_fake_sync;
+ double fakestore_fake_sync;
bool fakestore_fsync;
bool fakestore_writesync;
int fakestore_syncthreads; // such crap
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << fs->dev.get_device_name() << ").allocator."
void Allocator::dump_freelist()
*/
#undef dout
-#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq."
-#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq."
+#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ").elevatorq."
+#define derr(x) if (x <= g_conf.debug_bdev) cerr << g_clock.now() << " bdev(" << dev << ").elevatorq."
int BlockDevice::ElevatorQueue::dequeue_io(list<biovec*>& biols,
* BarrierQueue
*/
#undef dout
-#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq."
+#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ").barrierq."
void BlockDevice::BarrierQueue::barrier()
{
*/
#undef dout
-#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")."
+#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ")."
r = ioctl(fd, BLKGETSIZE64, &bytes);
num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE;
if (r == 0) {
- dout(1) << "get_num_blocks ioctl BLKGETSIZE64 reports "
- << num_blocks << " 4k blocks, "
- << bytes << " bytes"
- << endl;
+ dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports "
+ << num_blocks << " 4k blocks, "
+ << bytes << " bytes"
+ << endl;
#else
// hrm, try the 32 bit ioctl?
unsigned long sectors = 0;
num_blocks = sectors/8ULL;
bytes = sectors*512ULL;
if (r == 0) {
- dout(1) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, "
- << num_blocks << " 4k blocks, " << bytes << " bytes" << endl;
+ dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, "
+ << num_blocks << " 4k blocks, " << bytes << " bytes" << endl;
#endif
} else {
// hmm, try stat!
fstat(fd, &st);
uint64_t bytes = st.st_size;
num_blocks = bytes / EBOFS_BLOCK_SIZE;
- dout(1) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << endl;
+ dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << endl;
}
if (g_conf.bdev_fake_mb) {
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bh."
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.oc."
/************** BufferCache ***************/
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bc."
// *******************
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")."
-#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << dev.get_device_name() << ")."
+#define derr(x) if (x <= g_conf.debug_ebofs) cerr << g_clock.now() << " ebofs(" << dev.get_device_name() << ")."
char *nice_blocks(block_t b)
if (journalfn) {
journal = new FileJournal(this, journalfn);
if (journal->open() < 0) {
- dout(-3) << "mount journal " << journalfn << " open failed" << endl;
+ dout(3) << "mount journal " << journalfn << " open failed" << endl;
delete journal;
journal = 0;
} else {
- dout(-3) << "mount journal " << journalfn << " opened, replaying" << endl;
+ dout(3) << "mount journal " << journalfn << " opened, replaying" << endl;
while (1) {
bufferlist bl;
epoch_t e;
if (!journal->read_entry(bl, e)) {
- dout(-3) << "mount replay: end of journal, done." << endl;
+ dout(3) << "mount replay: end of journal, done." << endl;
break;
}
if (e < super_epoch) {
- dout(-3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << endl;
+ dout(3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << endl;
continue;
}
if (e == super_epoch+1) {
super_epoch++;
- dout(-3) << "mount replay: jumped to next epoch " << super_epoch << endl;
+ dout(3) << "mount replay: jumped to next epoch " << super_epoch << endl;
}
assert(e == super_epoch);
- dout(-3) << "mount replay: applying transaction in epoch " << e << endl;
+ dout(3) << "mount replay: applying transaction in epoch " << e << endl;
Transaction t;
int off = 0;
t._decode(bl, off);
_apply_transaction(t);
}
+
+ // done reading, make writeable.
+ journal->make_writeable();
}
}
commit_thread.create();
finisher_thread.create();
- dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+ dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks())
+ << (journal ? ", with journal":", no journal")
+ << endl;
mounted = true;
block_t num_blocks = dev.get_num_blocks();
// make a super-random fsid
+ srand48(time(0) ^ getpid());
+ super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48();
srand(time(0) ^ getpid());
- super_fsid = (lrand48() << 32) ^ mrand48();
+ super_fsid ^= rand();
+ super_fsid ^= (uint64_t)rand() << 32;
free_blocks = 0;
limbo_blocks = 0;
// create journal?
if (journalfn) {
- journal = new FileJournal(this, journalfn);
+ Journal *journal = new FileJournal(this, journalfn);
if (journal->create() < 0) {
dout(3) << "mount journal " << journalfn << " created failed" << endl;
- delete journal;
} else {
dout(3) << "mount journal " << journalfn << " created" << endl;
}
+ delete journal;
}
dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
if (bl.length() == 0) {
zleft += len;
left = 0;
+ } else {
+ assert(bl.length() == len);
}
if (zleft)
- dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl;
+ dout(10) << "apply_write zeroing " << zleft << " bytes before " << off << "~" << len
+ << " in " << *on << endl;
block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
block_t blen = blast-bstart+1;
bufferlist zb;
zb.push_back(zp);
bh->add_partial(off_in_bh, zb);
- zleft -= z;
+ zleft -= z;
opos += z;
}
unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe)
{
ebofs_lock.Lock();
- dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl;
+ dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << endl;
unsigned r = _apply_transaction(t);
// do ops
unsigned r = 0; // bit fields indicate which ops failed.
int bit = 1;
- for (list<int>::iterator p = t.ops.begin();
- p != t.ops.end();
- p++) {
- switch (*p) {
+ while (t.have_op()) {
+ int op = t.get_op();
+ switch (op) {
case Transaction::OP_READ:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
- bufferlist *pbl = t.pbls.front(); t.pbls.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
+ bufferlist *pbl;
+ t.get_pbl(pbl);
if (_read(oid, offset, len, *pbl) < 0) {
dout(7) << "apply_transaction fail on _read" << endl;
r &= bit;
case Transaction::OP_STAT:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- struct stat *st = t.psts.front(); t.psts.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ struct stat *st;
+ t.get_pstat(st);
if (_stat(oid, st) < 0) {
dout(7) << "apply_transaction fail on _stat" << endl;
r &= bit;
case Transaction::OP_GETATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- pair<void*,int*> pattrval = t.pattrvals.front(); t.pattrvals.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
+ pair<void*,int*> pattrval;
+ t.get_pattrval(pattrval);
if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) {
dout(7) << "apply_transaction fail on _getattr" << endl;
r &= bit;
case Transaction::OP_GETATTRS:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- map<string,bufferptr> *pset = t.pattrsets.front(); t.pattrsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ map<string,bufferptr> *pset;
+ t.get_pattrset(pset);
if (_getattrs(oid, *pset) < 0) {
dout(7) << "apply_transaction fail on _getattrs" << endl;
r &= bit;
case Transaction::OP_WRITE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
- bufferlist bl = t.bls.front(); t.bls.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
+ bufferlist bl;
+ t.get_bl(bl);
if (_write(oid, offset, len, bl) < 0) {
dout(7) << "apply_transaction fail on _write" << endl;
r &= bit;
case Transaction::OP_TRIMCACHE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
_trim_from_cache(oid, offset, len);
}
break;
case Transaction::OP_TRUNCATE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t len = t.offsets.front(); t.offsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t len;
+ t.get_length(len);
if (_truncate(oid, len) < 0) {
dout(7) << "apply_transaction fail on _truncate" << endl;
r &= bit;
case Transaction::OP_REMOVE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
+ object_t oid;
+ t.get_oid(oid);
if (_remove(oid) < 0) {
dout(7) << "apply_transaction fail on _remove" << endl;
r &= bit;
case Transaction::OP_SETATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
bufferlist bl;
- bl.claim( t.attrbls.front() );
- t.attrbls.pop_front();
+ t.get_bl(bl);
if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) {
dout(7) << "apply_transaction fail on _setattr" << endl;
r &= bit;
case Transaction::OP_SETATTRS:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- map<string,bufferptr> *pattrset = t.pattrsets.front(); t.pattrsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ map<string,bufferptr> *pattrset;
+ t.get_pattrset(pattrset);
if (_setattrs(oid, *pattrset) < 0) {
dout(7) << "apply_transaction fail on _setattrs" << endl;
r &= bit;
case Transaction::OP_RMATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
if (_rmattr(oid, attrname) < 0) {
dout(7) << "apply_transaction fail on _rmattr" << endl;
r &= bit;
case Transaction::OP_CLONE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- object_t noid = t.oids.front(); t.oids.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ object_t noid;
+ t.get_oid(noid);
if (_clone(oid, noid) < 0) {
dout(7) << "apply_transaction fail on _clone" << endl;
r &= bit;
case Transaction::OP_MKCOLL:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
if (_create_collection(cid) < 0) {
dout(7) << "apply_transaction fail on _create_collection" << endl;
r &= bit;
case Transaction::OP_RMCOLL:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
if (_destroy_collection(cid) < 0) {
dout(7) << "apply_transaction fail on _destroy_collection" << endl;
r &= bit;
case Transaction::OP_COLL_ADD:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- object_t oid = t.oids.front(); t.oids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ object_t oid;
+ t.get_oid(oid);
if (_collection_add(cid, oid) < 0) {
//dout(7) << "apply_transaction fail on _collection_add" << endl;
//r &= bit;
case Transaction::OP_COLL_REMOVE:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- object_t oid = t.oids.front(); t.oids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ object_t oid;
+ t.get_oid(oid);
if (_collection_remove(cid, oid) < 0) {
dout(7) << "apply_transaction fail on _collection_remove" << endl;
r &= bit;
case Transaction::OP_COLL_SETATTR:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ const char *attrname;
+ t.get_attrname(attrname);
bufferlist bl;
- bl.claim( t.attrbls.front() );
- t.attrbls.pop_front();
+ t.get_bl(bl);
if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) {
//if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) {
dout(7) << "apply_transaction fail on _collection_setattr" << endl;
case Transaction::OP_COLL_RMATTR:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
+ coll_t cid;
+ t.get_cid(cid);
+ const char *attrname;
+ t.get_attrname(attrname);
if (_collection_rmattr(cid, attrname) < 0) {
dout(7) << "apply_transaction fail on _collection_rmattr" << endl;
r &= bit;
break;
default:
- cerr << "bad op " << *p << endl;
+ cerr << "bad op " << op << endl;
assert(0);
}
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << ebofs->dev.get_device_name() << ").journal "
-#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << ebofs->dev.get_device_name() << ").journal "
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal "
+#define derr(x) if (x <= g_conf.debug_ebofs) cerr << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal "
int FileJournal::create()
{
- dout(1) << "create " << fn << endl;
+ dout(2) << "create " << fn << endl;
// open/create
fd = ::open(fn.c_str(), O_RDWR|O_SYNC);
if (fd < 0) {
- dout(1) << "create failed " << errno << " " << strerror(errno) << endl;
+ dout(2) << "create failed " << errno << " " << strerror(errno) << endl;
return -errno;
}
assert(fd > 0);
// get size
struct stat st;
::fstat(fd, &st);
- dout(1) << "open " << fn << " " << st.st_size << " bytes" << endl;
+ dout(2) << "create " << fn << " " << st.st_size << " bytes" << endl;
// write empty header
memset(&header, 0, sizeof(header));
assert(fd == 0);
fd = ::open(fn.c_str(), O_RDWR|O_SYNC);
if (fd < 0) {
- dout(1) << "open failed " << errno << " " << strerror(errno) << endl;
+ dout(2) << "open failed " << errno << " " << strerror(errno) << endl;
return -errno;
}
assert(fd > 0);
// read header?
read_header();
- if (header.num > 0 && header.fsid == ebofs->get_fsid()) {
+ if (header.fsid != ebofs->get_fsid()) {
+ dout(2) << "open journal fsid doesn't match, invalid (someone else's?) journal" << endl;
+ }
+ else if (header.num > 0) {
// valid header, pick an offset
for (int i=0; i<header.num; i++) {
if (header.epoch[i] == ebofs->get_super_epoch()) {
bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch)
{
if (!read_pos) {
- dout(1) << "read_entry -- not readable" << endl;
- make_writeable();
+ dout(2) << "read_entry -- not readable" << endl;
return false;
}
::lseek(fd, read_pos, SEEK_SET);
::read(fd, &h, sizeof(h));
if (!h.check_magic(read_pos, header.fsid)) {
- dout(1) << "read_entry " << read_pos << " : bad header magic, end of journal" << endl;
- make_writeable();
+ dout(2) << "read_entry " << read_pos << " : bad header magic, end of journal" << endl;
return false;
}
if (!f.check_magic(read_pos, header.fsid) ||
h.epoch != f.epoch ||
h.len != f.len) {
- dout(1) << "read_entry " << read_pos << " : bad footer magic, partially entry, end of journal" << endl;
- make_writeable();
+ dout(2) << "read_entry " << read_pos << " : bad footer magic, partially entry, end of journal" << endl;
return false;
}
void stop_writer();
void write_thread_entry();
- void make_writeable();
-
class Writer : public Thread {
FileJournal *journal;
public:
int open();
void close();
+ void make_writeable();
+
// writes
bool submit_entry(bufferlist& e, Context *oncommit); // submit an item
void commit_epoch_start(); // mark epoch boundary
virtual void close() = 0;
// writes
+ virtual void make_writeable() = 0;
virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item
virtual void commit_epoch_start() = 0; // mark epoch boundary
virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire)
#include "osd/OSD.h"
#include "client/Client.h"
#include "client/fuse.h"
+#include "client/fuse_ll.h"
#include "common/Timer.h"
int main(int argc, char **argv) {
cerr << "fakefuse starting" << endl;
+ // stop on our own (by default)
+ g_conf.mon_stop_on_last_unmount = true;
+ g_conf.mon_stop_with_last_mds = true;
+
vector<char*> args;
argv_to_vec(argc, argv, args);
parse_config_options(args);
if (g_conf.clock_tare) g_clock.tare();
MonMap *monmap = new MonMap(g_conf.num_mon);
-
+ entity_addr_t a;
+ a.nonce = getpid();
+ for (int i=0; i<g_conf.num_mon; i++) {
+ a.port = i;
+ monmap->mon_inst[i] = entity_inst_t(MSG_ADDR_MON(i), a); // hack ; see FakeMessenger.cc
+ }
+
Monitor *mon[g_conf.num_mon];
for (int i=0; i<g_conf.num_mon; i++) {
mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)), monmap);
mds[i] = new MDS(i, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
}
- // init
- for (int i=0; i<g_conf.num_mon; i++) {
+ // init
+ for (int i=0; i<g_conf.num_mon; i++)
mon[i]->init();
- }
- for (int i=0; i<NUMMDS; i++) {
- mds[i]->init();
- }
-
- for (int i=0; i<NUMOSD; i++) {
+ for (int i=0; i<NUMMDS; i++)
+ mds[i]->init();
+ for (int i=0; i<NUMOSD; i++)
osd[i]->init();
- }
// create client
// start up fuse
// use my argc, argv (make sure you pass a mount point!)
- cout << "starting fuse on pid " << getpid() << endl;
client[i]->mount();
char *oldcwd = get_current_dir_name(); // note previous wd
- ceph_fuse_main(client[i], argc, argv);
+ cout << "starting fuse on pid " << getpid() << endl;
+ if (g_conf.fuse_ll)
+ ceph_fuse_ll_main(client[i], argc, argv);
+ else
+ ceph_fuse_main(client[i], argc, argv);
+ cout << "fuse finished on pid " << getpid() << endl;
::chdir(oldcwd); // return to previous wd
+ free(oldcwd);
client[i]->unmount();
- cout << "fuse finished on pid " << getpid() << endl;
client[i]->shutdown();
}
MonMap *monmap = new MonMap(g_conf.num_mon);
entity_addr_t a;
+ a.nonce = getpid();
for (int i=0; i<g_conf.num_mon; i++) {
a.port = i;
monmap->mon_inst[i] = entity_inst_t(MSG_ADDR_MON(i), a); // hack ; see FakeMessenger.cc
OSD *mdsosd[g_conf.num_mds];
for (int i=0; i<g_conf.num_mds; i++) {
//cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
- mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS_NEW), monmap);
+ mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
if (g_conf.mds_local_osd)
mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)), monmap);
start++;
#include "common/Mutex.h"
+
+//#define BUFFER_USE_CCPP
+
+#ifdef BUFFER_USE_CCPP
+# include "cc++/thread.h"
+#endif
+
#include <iostream>
#include <list>
extern long buffer_total_alloc;
// </hack>
+
+
+
class buffer {
private:
public:
char *data;
unsigned len;
+#ifdef BUFFER_USE_CCPP
+ mutable ost::AtomicCounter nref; // mutable for const-ness of operator<<
+#else
int nref;
Mutex lock; // we'll make it non-recursive.
+#endif
- raw(unsigned l) : len(l), nref(0), lock(false) {}
- raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {}
+ raw(unsigned l) : len(l), nref(0)
+#ifndef BUFFER_USE_CCPP
+ , lock(false)
+#endif
+ { }
+ raw(char *c, unsigned l) : data(c), len(l), nref(0)
+#ifndef BUFFER_USE_CCPP
+ , lock(false)
+#endif
+ { }
virtual ~raw() {};
// no copying.
static raw* create_page_aligned(unsigned len) {
#ifndef __CYGWIN__
- return new raw_mmap_pages(len);
+ //return new raw_mmap_pages(len);
+ return new raw_posix_aligned(len);
#else
return new raw_hack_aligned(len);
#endif
}
ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) {
if (_raw) {
- _raw->lock.Lock();
+#ifdef BUFFER_USE_CCPP
++_raw->nref;
- _raw->lock.Unlock();
+#else
+ _raw->lock.Lock();
+ ++_raw->nref;
+ _raw->lock.Unlock();
+#endif
}
}
ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) {
assert(o+l <= p._len);
assert(_raw);
+#ifdef BUFFER_USE_CCPP
+ ++_raw->nref;
+#else
_raw->lock.Lock();
++_raw->nref;
_raw->lock.Unlock();
+#endif
}
ptr& operator= (const ptr& p) {
// be careful -- we need to properly handle self-assignment.
if (p._raw) {
+#ifdef BUFFER_USE_CCPP
+ ++p._raw->nref; // inc new
+#else
p._raw->lock.Lock();
++p._raw->nref; // inc new
p._raw->lock.Unlock();
+#endif
}
release(); // dec (+ dealloc) old (if any)
_raw = p._raw; // change my ref
void release() {
if (_raw) {
+#ifndef BUFFER_USE_CCPP
_raw->lock.Lock();
- if (--_raw->nref == 0) {
- //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl;
- _raw->lock.Unlock();
- delete _raw; // dealloc old (if any)
- } else
- _raw->lock.Unlock();
+#endif
+ if (--_raw->nref == 0) {
+ //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl;
+#ifndef BUFFER_USE_CCPP
+ _raw->lock.Unlock();
+#endif
+ delete _raw; // dealloc old (if any)
+ } else {
+#ifndef BUFFER_USE_CCPP
+ _raw->lock.Unlock();
+#endif
+ }
_raw = 0;
}
}
}
unsigned length() const {
-#if 1
+#if 0
// DEBUG: verify _len
unsigned len = 0;
for (std::list<ptr>::const_iterator it = _buffers.begin();
void append(const char *data, unsigned len) {
while (len > 0) {
// put what we can into the existing append_buffer.
- if (append_buffer.unused_tail_length() > 0) {
- unsigned gap = append_buffer.unused_tail_length();
+ unsigned gap = append_buffer.unused_tail_length();
+ if (gap > 0) {
if (gap > len) gap = len;
append_buffer.append(data, gap);
append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list
template<class T>
inline void _encode(const std::list<T>& ls, bufferlist& bl)
{
- uint32_t n = ls.size();
- _encoderaw(n, bl);
- for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
- _encode(*p, bl);
+ // should i pre- or post- count?
+ if (!ls.empty()) {
+ unsigned pos = bl.length();
+ uint32_t n = 0;
+ _encoderaw(n, bl);
+ for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p) {
+ n++;
+ _encode(*p, bl);
+ }
+ bl.copy_in(pos, sizeof(n), (char*)&n);
+ } else {
+ uint32_t n = ls.size(); // FIXME: this is slow on a list.
+ _encoderaw(n, bl);
+ for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ _encode(*p, bl);
+ }
}
template<class T>
inline void _decode(std::list<T>& ls, bufferlist& bl, int& off)
_encoderaw(len, bl);
bl.append(s);
}
+inline void _encode_destructively(bufferlist& s, bufferlist& bl)
+{
+ uint32_t len = s.length();
+ _encoderaw(len, bl);
+ bl.claim_append(s);
+}
inline void _decode(bufferlist& s, bufferlist& bl, int& off)
{
uint32_t len;
#include <stdint.h>
#include <map>
#include <list>
+#include <iostream>
#include "buffer.h"
/*
* TODO:
* - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
* iteration efficient (see, e.g., try_assimilate_children()
+ * - rework frag_t so that we mask the left-most (most significant) bits instead of
+ * the right-most (least significant) bits. just because it's more intuitive, and
+ * matches the network/netmask concept.
*/
typedef uint32_t _frag_t;
public:
frag_t() : _enc(0) { }
frag_t(unsigned v, unsigned b) : _enc((b << 24) +
- (v & (0xffffffff >> b))) { }
+ (v & (0xffffffffULL >> (32-b)))) { }
frag_t(_frag_t e) : _enc(e) { }
// constructors
// accessors
unsigned value() const { return _enc & 0xffffff; }
unsigned bits() const { return _enc >> 24; }
- unsigned mask() const { return 0xffffffff >> (32-bits()); }
+ unsigned mask() const { return 0xffffffffULL >> (32-bits()); }
+
operator _frag_t() const { return _enc; }
// tests
}
// splitting
- void split(int nb, list<frag_t>& fragments) const {
+ void split(int nb, std::list<frag_t>& fragments) const {
assert(nb > 0);
- unsigned nway = 1 << (nb-1);
+ unsigned nway = 1 << nb;
for (unsigned i=0; i<nway; i++)
- fragments.push_back( frag_t(value() | (i << (bits()+nb-1)), bits()+nb) );
+ fragments.push_back( frag_t(value() | (i << bits()),
+ bits()+nb) );
}
// binary splitting
frag_t get_sibling() const {
- assert(bits() > 0);
+ assert(!is_root());
return frag_t(_enc ^ (1 << (bits()-1)));
}
bool is_left() const {
frag_t right_child() const {
return frag_t(value() | (1<<bits()), bits()+1);
}
+
+ // sequencing
+ // true if value() is 0, the smallest value a frag can hold.
+ bool is_leftmost() const {
+ return value() == 0;
+ }
+ // true if value() equals mask(), i.e. all of the low bits() bits are set.
+ // (for the root frag, bits() == 0, both value() and mask() are 0.)
+ bool is_rightmost() const {
+ return value() == mask();
+ }
+ // the frag with the same bits() whose value is value()+1.
+ // must not be called on the rightmost frag (asserted).
+ frag_t next() const {
+ assert(!is_rightmost());
+ return frag_t(value() + 1, bits());
+ }
};
-inline ostream& operator<<(ostream& out, frag_t& hb)
+inline std::ostream& operator<<(std::ostream& out, frag_t hb)
{
- return out << hex << hb.value() << dec << "/" << hb.bits();
+ return out << std::hex << hb.value() << std::dec << "/" << hb.bits();
}
bool is_leaf(frag_t x) const {
- list<frag_t> ls;
- get_leaves_under_split(x, ls);
+ std::list<frag_t> ls;
+ get_leaves_under(x, ls);
+ //cout << "is_leaf(" << x << ") -> " << ls << endl;
if (!ls.empty() &&
- ls.front() == x)
+ ls.front() == x &&
+ ls.size() == 1)
return true;
return false;
}
/**
* get_leaves -- list all leaves
*/
- void get_leaves(list<frag_t>& ls) const {
+ void get_leaves(std::list<frag_t>& ls) const {
return get_leaves_under_split(frag_t(), ls);
}
/**
* get_leaves_under_split -- list all leaves under a known split point (or root)
*/
- void get_leaves_under_split(frag_t under, list<frag_t>& ls) const {
- list<frag_t> q;
+ void get_leaves_under_split(frag_t under, std::list<frag_t>& ls) const {
+ std::list<frag_t> q;
q.push_back(under);
while (!q.empty()) {
frag_t t = q.front();
frag_t branch = get_branch(x);
int nb = get_split(branch);
if (nb > 0 && // if branch is a split, and
- branch.bits() + nb <= x.bits()) // one of the children is or contains x
- return frag_t(branch.bits()+nb, x.value()); // then return that child (it's a leaf)
+ branch.bits() + nb <= x.bits()) // one of the children is or contains x
+ return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf)
else
return branch;
}
/**
* get_leaves_under(x, ls) -- search for any leaves fully contained by x
*/
- void get_leaves_under(frag_t x, list<frag_t>& ls) const {
- list<frag_t> q;
+ void get_leaves_under(frag_t x, std::list<frag_t>& ls) const {
+ std::list<frag_t> q;
q.push_back(get_branch(x));
while (!q.empty()) {
frag_t t = q.front();
* contains(fg) -- does fragtree contain the specific frag @x
*/
bool contains(frag_t x) const {
- list<frag_t> q;
+ std::list<frag_t> q;
q.push_back(get_branch(x));
while (!q.empty()) {
frag_t t = q.front();
if (nb == 0) return t; // done.
// pick appropriate child fragment.
- unsigned nway = 1 << (nb-1);
+ unsigned nway = 1 << nb;
unsigned i;
for (i=0; i<nway; i++) {
- frag_t n(t.value() | (i << (t.bits()+nb-1)), t.bits()+nb);
+ frag_t n(t.value() | (i << t.bits()),
+ t.bits()+nb);
if (n.contains(v)) {
t = n;
break;
void try_assimilate_children(frag_t x) {
int nb = get_split(x);
if (!nb) return;
- list<frag_t> children;
+ std::list<frag_t> children;
x.split(nb, children);
int childbits = 0;
- for (list<frag_t>::iterator p = children.begin();
+ for (std::list<frag_t>::iterator p = children.begin();
p != children.end();
++p) {
int cb = get_split(*p);
childbits = cb;
}
// all children are split with childbits!
- for (list<frag_t>::iterator p = children.begin();
+ for (std::list<frag_t>::iterator p = children.begin();
p != children.end();
++p)
_splits.erase(*p);
_splits[x] += childbits;
}
- void force_to_leaf(frag_t x) {
- assert(!is_leaf(x));
+ /**
+ * force_to_leaf(x) -- restructure the tree so that x is a leaf
+ *
+ * returns false (and changes nothing) if x is already a leaf; otherwise
+ * splits and/or merges as needed, asserts that x ended up a leaf, and
+ * returns true.
+ *
+ * progress is traced on std::cout. the stream names are qualified because
+ * the rest of this header spells out std:: and does not depend on a
+ * using-directive.
+ */
+ bool force_to_leaf(frag_t x) {
+ if (is_leaf(x))
+ return false;
+
+ std::cout << "force_to_leaf " << x << " on " << _splits << std::endl;
frag_t parent = get_branch_or_leaf(x);
assert(parent.bits() <= x.bits());
+ std::cout << "parent is " << parent << std::endl;
// do we need to split from parent to x?
if (parent.bits() < x.bits()) {
int spread = x.bits() - parent.bits();
int nb = get_split(parent);
+ std::cout << "spread " << spread << ", parent splits by " << nb << std::endl;
if (nb == 0) {
// easy: split parent (a leaf) by the difference
+ std::cout << "splitting parent " << parent << " by spread " << spread << std::endl;
split(parent, spread);
- return;
+ assert(is_leaf(x));
+ return true;
}
assert(nb > spread);
merge(parent, nb);
split(parent, spread);
- list<frag_t> subs;
+ std::list<frag_t> subs;
parent.split(spread, subs);
- for (list<frag_t>::iterator p = subs.begin();
+ for (std::list<frag_t>::iterator p = subs.begin();
p != subs.end();
- ++p)
+ ++p) {
+ std::cout << "splitting intermediate " << *p << " by " << (nb-spread) << std::endl;
split(*p, nb - spread);
+ }
}
// x is now a leaf or split.
// hoover up any children.
- list<frag_t> q;
+ std::list<frag_t> q;
q.push_back(x);
while (!q.empty()) {
frag_t t = q.front();
q.pop_front();
int nb = get_split(t);
if (nb) {
+ std::cout << "merging child " << t << " by " << nb << std::endl;
merge(t, nb); // merge this point, and
t.split(nb, q); // queue up children
}
- }
+ }
+
+ std::cout << "force_to_leaf done" << std::endl;
+ assert(is_leaf(x));
+ return true;
}
// verify that we describe a legal partition of the namespace.
::_decode(_splits, bl, off);
}
- void print(ostream& out) {
+ void print(std::ostream& out) {
out << "fragtree_t(";
- list<frag_t> q;
+ std::list<frag_t> q;
q.push_back(frag_t());
while (!q.empty()) {
frag_t t = q.front();
}
};
-inline ostream& operator<<(ostream& out, fragtree_t& ft)
+inline std::ostream& operator<<(std::ostream& out, fragtree_t& ft)
{
out << "fragtree_t(";
- list<frag_t> q;
- q.push_back(frag_t());
- while (!q.empty()) {
- frag_t t = q.front();
- q.pop_front();
- int nb = ft.get_split(t);
- if (nb) {
- if (t.bits()) out << ' ';
- out << t << '%' << nb;
- t.split(nb, q); // queue up children
+ if (0) {
+ std::list<frag_t> q;
+ q.push_back(frag_t());
+ while (!q.empty()) {
+ frag_t t = q.front();
+ q.pop_front();
+ int nb = ft.get_split(t);
+ if (nb) {
+ if (t.bits()) out << ' ';
+ out << t << '%' << nb;
+ t.split(nb, q); // queue up children
+ }
}
}
+ if (1) {
+ std::list<frag_t> leaves;
+ ft.get_leaves(leaves);
+ out << leaves;
+ }
return out << ")";
}
* fragset_t -- a set of fragments
*/
class fragset_t {
- set<frag_t> _set;
+ std::set<frag_t> _set;
public:
- set<frag_t> &get() { return _set; }
- set<frag_t>::iterator begin() { return _set.begin(); }
- set<frag_t>::iterator end() { return _set.end(); }
+ std::set<frag_t> &get() { return _set; }
+ std::set<frag_t>::iterator begin() { return _set.begin(); }
+ std::set<frag_t>::iterator end() { return _set.end(); }
bool empty() const { return _set.empty(); }
void simplify() {
while (1) {
bool clean = true;
- set<frag_t>::iterator p = _set.begin();
+ std::set<frag_t>::iterator p = _set.begin();
while (p != _set.end()) {
- if (_set.count(p->get_sibling())) {
+ if (!p->is_root() &&
+ _set.count(p->get_sibling())) {
_set.erase(p->get_sibling());
_set.insert(p->parent());
_set.erase(p++);
}
};
-inline ostream& operator<<(ostream& out, fragset_t& fs)
+inline std::ostream& operator<<(std::ostream& out, fragset_t& fs)
{
- return out << "fragset_t(" << fs.get() << ")" << endl;
+ return out << "fragset_t(" << fs.get() << ")";
}
#endif
return tail;
}
+ // empty the list by removing the head element until len reaches 0.
+ // NOTE(review): relies on remove() decrementing len -- confirm, otherwise
+ // this loop would not terminate.
+ void clear() {
+ while (len > 0) {
+ remove(get_head());
+ }
+ }
+
void insert_head(LRUObject *o) {
o->lru_next = head;
o->lru_prev = NULL;
LRU(int max = 0) {
lru_num = 0;
lru_num_pinned = 0;
- lru_midpoint = .9;
+ lru_midpoint = .6;
lru_max = max;
}
void lru_set_max(uint32_t m) { lru_max = m; }
void lru_set_midpoint(float f) { lru_midpoint = f; }
+ // drop every object from all three internal lists (top, bottom, pintail).
+ // NOTE(review): lru_num and lru_num_pinned are not reset here -- confirm
+ // whether the list-level clear() keeps them in sync or callers expect stale
+ // counts.
+ void lru_clear() {
+ lru_top.clear();
+ lru_bot.clear();
+ lru_pintail.clear();
+ }
// insert at top of lru
void lru_insert_top(LRUObject *o) {
struct FileLayout {
// -- file -> object mapping --
- int stripe_unit; // stripe unit, in bytes
- int stripe_count; // over this many objects
- int object_size; // until objects are this big, then move to new objects
+ int32_t stripe_unit; // stripe unit, in bytes
+ int32_t stripe_count; // over this many objects
+ int32_t object_size; // until objects are this big, then move to new objects
int stripe_width() { return stripe_unit * stripe_count; }
// -- object -> pg layout --
char pg_type; // pg type (replicated, raid, etc.) (see pg_t::TYPE_*)
char pg_size; // pg size (num replicas, or raid4 stripe width)
- int preferred; // preferred primary osd?
+ int32_t preferred; // preferred primary osd?
// -- pg -> disk layout --
- int object_stripe_unit; // for per-object raid
+ int32_t object_stripe_unit; // for per-object raid
FileLayout() { }
FileLayout(int su, int sc, int os, int pgt, int pgs, int o=-1) :
#define FILE_MODE_RW (1|2)
#define FILE_MODE_LAZY 4
-#define INODE_MASK_BASE 1 // ino, layout, symlink value
-#define INODE_MASK_AUTH 2 // uid, gid, mode
-#define INODE_MASK_LINK 4 // nlink, anchored
-#define INODE_MASK_FILE 8 // mtime, size.
-// atime?
-
-#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_AUTH|INODE_MASK_LINK|INODE_MASK_FILE)
-
-#define INODE_MASK_SIZE INODE_MASK_FILE // size, blksize, blocks
-#define INODE_MASK_MTIME INODE_MASK_FILE // mtime
-#define INODE_MASK_ATIME INODE_MASK_FILE // atime
-#define INODE_MASK_CTIME (INODE_MASK_FILE|INODE_MASK_AUTH|INODE_MASK_LINK) // ctime
+/** stat masks
+ */
+#define STAT_MASK_INO 1 // inode number
+#define STAT_MASK_TYPE 2 // file type bits of the mode
+#define STAT_MASK_BASE 4 // layout, symlink value
+#define STAT_MASK_AUTH 8 // uid, gid, mode
+#define STAT_MASK_LINK 16 // nlink, anchored
+#define STAT_MASK_FILE 32 // mtime, size.
+
+#define STAT_MASK_ALL 63
+
+#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks
+#define STAT_MASK_MTIME STAT_MASK_FILE // mtime
+#define STAT_MASK_ATIME STAT_MASK_FILE // atime
+#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime
+
+// convert a d_type value into file-type mode bits by shifting it left 12
+// bits (e.g. 4, the DT_DIR value, becomes 040000).
+// the commented-out switch is the explicit per-type mapping this replaces.
+inline int DT_TO_MODE(int dt) {
+ return dt << 12;
+ /*
+ switch (dt) {
+ case DT_REG: return INODE_MODE_FILE;
+ case DT_DIR: return INODE_MODE_DIR;
+ case DT_LNK: return INODE_MODE_SYMLINK;
+ default: assert(0); return 0;
+ }
+ */
+}
struct inode_t {
// base (immutable)
inodeno_t ino;
FileLayout layout; // ?immutable?
+ dev_t rdev; // if special file
// affected by any inode change...
utime_t ctime; // inode change time
gid_t gid;
// nlink
- int nlink;
+ int32_t nlink;
bool anchored; // auth only?
// file (data access)
utime_t atime; // file data access time.
// special stuff
- int mask; // used for client stat. hack.
- version_t version; // auth only
- version_t file_data_version; // auth only
+ version_t version; // auth only
+ version_t file_data_version; // auth only
+ // file type
bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; }
bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; }
bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; }
+
+ // corresponding d_types
+ static const unsigned char DT_REG = 8;
+ static const unsigned char DT_DIR = 4;
+ static const unsigned char DT_LNK = 10;
};
+// convert a mode into a d_type value by shifting it right 12 bits; the
+// result is returned as an unsigned char.
+// NOTE(review): mode is not masked with the file-type mask first, so this
+// assumes no bits are set above the file-type field -- confirm.
+// the commented-out S_IS*() chain is the explicit per-type mapping this
+// replaces.
+inline unsigned char MODE_TO_DT(int mode) {
+ return mode >> 12;
+ /*
+ if (S_ISREG(mode)) return inode_t::DT_REG;
+ if (S_ISLNK(mode)) return inode_t::DT_LNK;
+ if (S_ISDIR(mode)) return inode_t::DT_DIR;
+ assert(0);
+ return 0;
+ */
+}
+
// is just casting it to long& OK?
long& usec_ref() { return (long&) tv.tv_usec; }
+ struct timeval& tv_ref() { return tv; }
+
// cast to double
operator double() {
return (double)sec() + ((double)usec() / 1000000.0L);
}
inline utime_t& operator+=(utime_t& l, double f) {
double fs = trunc(f);
- double us = (f - fs) / (double)1000000.0;
+ double us = (f - fs) * (double)1000000.0;
l.sec_ref() += (long)fs;
l.usec_ref() += (long)us;
l.normalize();
time_t tt = t.sec();
localtime_r(&tt, &bdt);
out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07'
- << std::setw(2) << bdt.tm_mon
+ << std::setw(2) << (bdt.tm_mon+1)
<< std::setw(2) << bdt.tm_mday
<< "."
<< std::setw(2) << bdt.tm_hour
case ANCHOR_OP_COMMIT: return "commit";
case ANCHOR_OP_ACK: return "ack";
case ANCHOR_OP_ROLLBACK: return "rollback";
- default: assert(0);
+ default: assert(0); return 0;
}
}
#include <cassert>
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") "
ostream& CDentry::print_db_line_prefix(ostream& out)
{
dn.make_path(path);
out << "[dentry " << path;
+
if (dn.is_auth()) {
out << " auth";
if (dn.is_replicated())
}
if (dn.is_null()) out << " NULL";
- if (dn.is_remote()) out << " REMOTE";
+ if (dn.is_remote()) {
+ out << " REMOTE(";
+ switch (dn.get_remote_d_type()) {
+ case inode_t::DT_REG: out << "reg"; break;
+ case inode_t::DT_DIR: out << "dir"; break;
+ case inode_t::DT_LNK: out << "lnk"; break;
+ default: assert(0);
+ }
+ out << ")";
+ }
out << " " << dn.lock;
dout(10) << " mark_dirty " << *this << endl;
// i now live in this new dir version
- assert(pv == projected_version);
+ assert(pv <= projected_version);
version = pv;
_mark_dirty();
void CDentry::auth_pin()
{
- assert(dir);
- dir->auth_pin();
+ if (auth_pins == 0)
+ get(PIN_AUTHPIN);
+ auth_pins++;
+ dir->adjust_nested_auth_pins(1);
}
void CDentry::auth_unpin()
{
- assert(dir);
- dir->auth_unpin();
+ // drop one auth pin on this dentry. the PIN_AUTHPIN reference is released
+ // when the last auth pin goes away, and the containing dir's nested count
+ // is decremented to match.
+ auth_pins--;
+ assert(auth_pins >= 0); // catch an unpin without a matching auth_pin()
+ if (auth_pins == 0)
+ put(PIN_AUTHPIN);
+ dir->adjust_nested_auth_pins(-1);
+}
+
+// add `by` (may be negative) to this dentry's nested_auth_pins and pass the
+// same delta on to the containing dir.
+void CDentry::adjust_nested_auth_pins(int by)
+{
+ nested_auth_pins += by;
+ dir->adjust_nested_auth_pins(by);
}
public:
// -- state --
static const int STATE_NEW = 1;
+ static const int STATE_FRAGMENTING = 2;
// -- pins --
- static const int PIN_INODEPIN = 1; // linked inode is pinned
+ static const int PIN_INODEPIN = 1; // linked inode is pinned
+ static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting
const char *pin_name(int p) {
switch (p) {
case PIN_INODEPIN: return "inodepin";
+ case PIN_FRAGMENTING: return "fragmenting";
default: return generic_pin_name(p);
}
};
}
protected:
- string name;
- CInode *inode;
- CDir *dir;
+ string name;
- inodeno_t remote_ino; // if remote dentry
+ inodeno_t remote_ino; // if remote dentry
+ unsigned char remote_d_type;
- version_t version; // dir version when last touched.
- version_t projected_version; // what it will be when i unlock/commit.
+ CInode *inode; // linked inode (if any)
+ CDir *dir; // containing dirfrag
+ version_t version; // dir version when last touched.
+ version_t projected_version; // what it will be when i unlock/commit.
+
+ off_t dir_offset;
+
+ int auth_pins, nested_auth_pins;
friend class Migrator;
friend class Locker;
public:
// cons
CDentry() :
- inode(0),
- dir(0),
- remote_ino(0),
- version(0),
- projected_version(0),
+ remote_ino(0), remote_d_type(0),
+ inode(0), dir(0),
+ version(0), projected_version(0),
+ dir_offset(0),
+ auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
- CDentry(const string& n, inodeno_t ino, CInode *in=0) :
+ CDentry(const string& n, CInode *in) :
name(n),
- inode(in),
- dir(0),
- remote_ino(ino),
- version(0),
- projected_version(0),
+ remote_ino(0), remote_d_type(0),
+ inode(in), dir(0),
+ version(0), projected_version(0),
+ dir_offset(0),
+ auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
- CDentry(const string& n, CInode *in) :
+ CDentry(const string& n, inodeno_t ino, unsigned char dt, CInode *in=0) :
name(n),
- inode(in),
- dir(0),
- remote_ino(0),
- version(0),
- projected_version(0),
+ remote_ino(ino), remote_d_type(dt),
+ inode(in), dir(0),
+ version(0), projected_version(0),
+ dir_offset(0),
+ auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
CInode *get_inode() const { return inode; }
CDir *get_dir() const { return dir; }
const string& get_name() const { return name; }
inodeno_t get_ino();
- inodeno_t get_remote_ino() { return remote_ino; }
- void set_remote_ino(inodeno_t ino) { remote_ino = ino; }
+ off_t get_dir_offset() { return dir_offset; }
+ void set_dir_offset(off_t o) { dir_offset = o; }
+ void clear_dir_offset() { dir_offset = 0; }
+ inodeno_t get_remote_ino() { return remote_ino; }
+ unsigned char get_remote_d_type() { return remote_d_type; }
+ void set_remote(inodeno_t ino, unsigned char d_type) {
+ remote_ino = ino;
+ remote_d_type = d_type;
+ }
// ref counts: pin ourselves in the LRU when we're pinned.
void first_get() {
bool can_auth_pin();
void auth_pin();
void auth_unpin();
+ void adjust_nested_auth_pins(int by);
// dentry type is primary || remote || null
string dname;
int replica_nonce;
int lockstate;
-
+ off_t dir_offset;
inodeno_t remote_ino;
+ unsigned char remote_d_type;
public:
CDentryDiscover() {}
CDentryDiscover(CDentry *dn, int nonce) :
dname(dn->get_name()), replica_nonce(nonce),
lockstate(dn->lock.get_replica_state()),
- remote_ino(dn->get_remote_ino()) { }
+ dir_offset(dn->get_dir_offset()),
+ remote_ino(dn->get_remote_ino()), remote_d_type(dn->get_remote_d_type()) { }
string& get_dname() { return dname; }
int get_nonce() { return replica_nonce; }
bool is_remote() { return remote_ino ? true:false; }
inodeno_t get_remote_ino() { return remote_ino; }
+ unsigned char get_remote_d_type() { return remote_d_type; }
void update_dentry(CDentry *dn) {
- dn->set_replica_nonce( replica_nonce );
+ dn->set_dir_offset(dir_offset);
+ dn->set_replica_nonce(replica_nonce);
}
void init_dentry_lock(CDentry *dn) {
dn->lock.set_state( lockstate );
void _encode(bufferlist& bl) {
::_encode(dname, bl);
+ ::_encode(dir_offset, bl);
::_encode(remote_ino, bl);
+ ::_encode(remote_d_type, bl);
::_encode(replica_nonce, bl);
::_encode(lockstate, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(dname, bl, off);
+ ::_decode(dir_offset, bl, off);
::_decode(remote_ino, bl, off);
+ ::_decode(remote_d_type, bl, off);
::_decode(replica_nonce, bl, off);
::_decode(lockstate, bl, off);
}
*/
+#include "include/types.h"
#include "CDir.h"
#include "CDentry.h"
{
string path;
dir.get_inode()->make_path(path);
- out << "[dir " << dir.ino();
- if (!dir.frag.is_root()) out << "%" << dir.frag;
- out << " " << path << "/";
+ out << "[dir " << dir.dirfrag() << " " << path << "/";
if (dir.is_auth()) {
out << " auth";
if (dir.is_replicated())
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "
//#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache." << *this << " "
ostream& CDir::print_db_line_prefix(ostream& out)
{
- return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") ";
+ return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") ";
}
projected_version = version = 0;
committing_version = 0;
- committed_version = 0;
+ committed_version_equivalent = committed_version = 0;
// dir_auth
dir_auth = CDIR_AUTH_DEFAULT;
* linking fun
*/
-CDentry* CDir::add_dentry( const string& dname, inodeno_t ino)
+CDentry* CDir::add_null_dentry(const string& dname)
{
// foreign
assert(lookup(dname) == 0);
// create dentry
- CDentry* dn = new CDentry(dname, ino);
+ CDentry* dn = new CDentry(dname, 0);
if (is_auth())
dn->state_set(CDentry::STATE_AUTH);
cache->lru.lru_insert_mid(dn);
//assert(null_items.count(dn->name) == 0);
items[dn->name] = dn;
- nitems++;
+ nnull++;
- dout(12) << "add_dentry " << *dn << endl;
+ dout(12) << "add_null_dentry " << *dn << endl;
// pin?
if (nnull + nitems == 1) get(PIN_CHILD);
}
-CDentry* CDir::add_dentry( const string& dname, CInode *in)
+CDentry* CDir::add_primary_dentry(const string& dname, CInode *in)
{
// primary
assert(lookup(dname) == 0);
//assert(null_items.count(dn->name) == 0);
items[dn->name] = dn;
+ link_inode_work( dn, in );
- if (in) {
- link_inode_work( dn, in );
- } else {
- assert(dn->inode == 0);
- //null_items[dn->name] = dn;
- nnull++;
- }
+ dout(12) << "add_primary_dentry " << *dn << endl;
+
+ // pin?
+ if (nnull + nitems == 1) get(PIN_CHILD);
+
+ assert(nnull + nitems == items.size());
+ //assert(nnull == null_items.size());
+ return dn;
+}
+
+CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type)
+{
+ // foreign
+ assert(lookup(dname) == 0);
+
+ // create dentry
+ CDentry* dn = new CDentry(dname, ino, d_type);
+ if (is_auth())
+ dn->state_set(CDentry::STATE_AUTH);
+ cache->lru.lru_insert_mid(dn);
+
+ dn->dir = this;
+ dn->version = projected_version;
+
+ // add to dir
+ assert(items.count(dn->name) == 0);
+ //assert(null_items.count(dn->name) == 0);
+
+ items[dn->name] = dn;
+ nitems++;
- dout(12) << "add_dentry " << *dn << endl;
+ dout(12) << "add_remote_dentry " << *dn << endl;
// pin?
if (nnull + nitems == 1) get(PIN_CHILD);
//assert(nnull == null_items.size());
}
-void CDir::link_inode( CDentry *dn, inodeno_t ino)
+void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type)
{
dout(12) << "link_inode " << *dn << " remote " << ino << endl;
assert(dn->is_null());
- dn->set_remote_ino(ino);
+ dn->set_remote(ino, d_type);
nitems++;
+ dn->clear_dir_offset();
//assert(null_items.count(dn->name) == 1);
//null_items.erase(dn->name);
assert(nnull + nitems == items.size());
}
-void CDir::link_inode( CDentry *dn, CInode *in )
+void CDir::link_primary_inode(CDentry *dn, CInode *in)
{
- dout(12) << "link_inode " << *dn << " " << *in << endl;
+ dout(12) << "link_primary_inode " << *dn << " " << *in << endl;
assert(!dn->is_remote());
link_inode_work(dn,in);
+ dn->clear_dir_offset();
// remove from null list
//assert(null_items.count(dn->name) == 1);
//assert(nnull == null_items.size());
}
-void CDir::link_inode_work( CDentry *dn, CInode *in )
+void CDir::link_inode_work( CDentry *dn, CInode *in)
{
dn->inode = in;
in->set_primary_parent(dn);
// adjust auth pin count
if (in->auth_pins + in->nested_auth_pins)
- adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins );
+ dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins);
}
void CDir::unlink_inode( CDentry *dn )
dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl;
}
+ dn->clear_dir_offset();
unlink_inode_work(dn);
// add to null list
void CDir::try_remove_unlinked_dn(CDentry *dn)
{
assert(dn->dir == this);
+ assert(dn->is_null());
+ assert(dn->is_dirty());
- if (dn->is_new() && dn->is_dirty() &&
- dn->get_num_ref() == 1) {
+ // no pins (besides dirty)?
+ if (dn->get_num_ref() != 1)
+ return;
+
+ // was the dn new? or is the dir complete (i.e. we don't need negatives)?
+ if (dn->is_new() || is_complete()) {
dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl;
dn->mark_clean();
remove_dentry(dn);
void CDir::unlink_inode_work( CDentry *dn )
{
CInode *in = dn->inode;
-
+
if (dn->is_remote()) {
// remote
if (in)
dn->unlink_remote();
- dn->set_remote_ino(0);
+ dn->set_remote(0, 0);
} else {
// primary
assert(dn->is_primary());
// unlink auth_pin count
if (in->auth_pins + in->nested_auth_pins)
- adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) );
+ dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins));
// detach inode
in->remove_primary_parent(dn);
}
+/**
+ * steal_dentry -- semi-violently move a dentry from one CDir to another
+ * (*) violently, in that nitems, most pins, etc. are not correctly maintained
+ * on the old CDir corpse; must call purge_stolen() when finished.
+ */
void CDir::steal_dentry(CDentry *dn)
{
dout(15) << "steal_dentry " << *dn << endl;
items[dn->name] = dn;
- if (nitems == 0)
+ dn->dir->items.erase(dn->name);
+ if (dn->dir->items.empty())
+ dn->dir->put(PIN_CHILD);
+
+ if (nnull + nitems == 0)
get(PIN_CHILD);
- nitems++;
if (dn->is_null())
nnull++;
- if (dn->is_primary())
- nested_auth_pins += dn->inode->auth_pins + dn->inode->nested_auth_pins;
+ else
+ nitems++;
+
+ nested_auth_pins += dn->auth_pins + dn->nested_auth_pins;
if (dn->is_dirty())
num_dirty++;
void CDir::purge_stolen(list<Context*>& waiters)
{
- if (!items.empty()) {
- put(PIN_CHILD);
- items.clear();
+ // take waiters _before_ unfreeze...
+ take_waiting(WAIT_ANY, waiters);
+
+ if (is_auth()) {
+ assert(is_frozen_dir());
+ unfreeze_dir();
}
- if (is_dirty()) mark_clean();
+ nnull = nitems = 0;
+ if (is_auth())
+ clear_replica_map();
+ if (is_dirty()) mark_clean();
if (state_test(STATE_EXPORT)) put(PIN_EXPORT);
if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND);
if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND);
- if (state_test(STATE_FROZENDIR)) put(PIN_FROZEN);
if (auth_pins > 0) put(PIN_AUTHPIN);
- take_waiting(WAIT_ANY, waiters);
-
- assert(get_num_ref() == 0);
+ assert(get_num_ref() == (state_test(STATE_STICKY) ? 1:0));
}
void CDir::init_fragment_pins()
{
+ if (!replica_map.empty()) get(PIN_REPLICATED);
if (state_test(STATE_DIRTY)) get(PIN_DIRTY);
- if (state_test(STATE_FROZENDIR)) get(PIN_FROZEN);
if (state_test(STATE_EXPORT)) get(PIN_EXPORT);
if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND);
if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND);
- if (state_test(STATE_STICKY)) get(PIN_STICKY);
}
void CDir::split(int bits, list<CDir*>& subs, list<Context*>& waiters)
{
- dout(10) << "split by " << bits << " bits" << endl;
-
- assert(is_complete());
+ dout(10) << "split by " << bits << " bits on " << *this << endl;
+
+ if (cache->mds->logger) cache->mds->logger->inc("dir_sp");
+
+ assert(is_complete() || !is_auth());
list<frag_t> frags;
frag.split(bits, frags);
vector<CDir*> subfrags(1 << bits);
// create subfrag dirs
+ int n = 0;
for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
- CDir *f = new CDir(inode, *p, cache, true);
+ CDir *f = new CDir(inode, *p, cache, is_auth());
f->state_set(state & MASK_STATE_FRAGMENT_KEPT);
- f->init_fragment_pins();
- f->set_version(get_version());
f->replica_map = replica_map;
+ f->dir_auth = dir_auth;
+ f->init_fragment_pins();
+ f->version = version;
+ f->projected_version = projected_version;
dout(10) << " subfrag " << *p << " " << *f << endl;
- subfrags.push_back(f);
+ subfrags[n++] = f;
+ subs.push_back(f);
inode->add_dirfrag(f);
}
- assert(subfrags.size() == frags.size());
// repartition dentries
while (!items.empty()) {
// merge state
state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT);
+ dir_auth = dir->dir_auth;
dir->purge_stolen(waiters);
inode->close_dirfrag(dir->get_frag());
void CDir::first_get()
{
- inode->get(CInode::PIN_DIR);
+ inode->get(CInode::PIN_DIRFRAG);
}
void CDir::last_put()
{
- inode->put(CInode::PIN_DIR);
+ inode->put(CInode::PIN_DIRFRAG);
}
}
};
-void CDir::fetch(Context *c)
+void CDir::fetch(Context *c, bool ignore_authpinnability)
{
dout(10) << "fetch on " << *this << endl;
assert(is_auth());
assert(!is_complete());
+ if (!can_auth_pin() && !ignore_authpinnability) {
+ dout(7) << "fetch waiting for authpinnable" << endl;
+ add_waiter(WAIT_AUTHPINNABLE, c);
+ return;
+ }
+
if (c) add_waiter(WAIT_COMPLETE, c);
// already fetching?
return;
}
+ auth_pin();
state_set(CDir::STATE_FETCHING);
- if (cache->mds->logger) cache->mds->logger->inc("fdir");
+ if (cache->mds->logger) cache->mds->logger->inc("dir_f");
// start by reading the first hunk of it
C_Dir_Fetch *fin = new C_Dir_Fetch(this);
void CDir::_fetched(bufferlist &bl)
{
- dout(10) << "_fetched " << 0 << "~" << bl.length()
- << " on " << *this
+ dout(10) << "_fetched " << bl.length()
+ << " bytes for " << *this
<< endl;
- // give up?
- if (!is_auth() || is_frozen()) {
- dout(10) << "_fetched canceling (!auth or frozen)" << endl;
- //ondisk_bl.clear();
- //ondisk_size = 0;
-
- // kick waiters?
- state_clear(CDir::STATE_FETCHING);
- finish_waiting(WAIT_COMPLETE, -1);
- return;
- }
+ assert(is_auth());
+ assert(!is_frozen());
// decode.
int len = bl.length();
int off = 0;
- version_t got_version;
+ version_t got_version;
- bl.copy(off, sizeof(got_version), (char*)&got_version);
- off += sizeof(got_version);
+ ::_decode(got_version, bl, off);
dout(10) << "_fetched version " << got_version
<< ", " << len << " bytes"
<< endl;
- while (off < len) {
+ int32_t n;
+ ::_decode(n, bl, off);
+
+ for (int i=0; i<n; i++) {
+ off_t dn_offset = off;
+
// marker
char type = bl[off];
++off;
// dname
string dname;
::_decode(dname, bl, off);
dout(24) << "_fetched parsed marker '" << type << "' dname '" << dname << "'" << endl;
CDentry *dn = lookup(dname); // existing dentry?
if (type == 'L') {
// hard link
inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
-
+ unsigned char d_type;
+ ::_decode(ino, bl, off);
+ ::_decode(d_type, bl, off);
+
if (dn) {
if (dn->get_inode() == 0) {
dout(12) << "_fetched had NEG dentry " << *dn << endl;
}
} else {
// (remote) link
- CDentry *dn = add_dentry( dname, ino );
+ dn = add_remote_dentry(dname, ino, d_type);
// link to inode?
CInode *in = cache->get_inode(ino); // we may or may not have it.
cache->add_inode( in );
// link
- add_dentry( dname, in );
- dout(12) << "_fetched got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl;
+ dn = add_primary_dentry(dname, in);
+ dout(12) << "_fetched got " << *dn << " " << *in << endl;
}
}
} else {
<< " at pos " << off << endl;
assert(0);
}
+
+ // make note of dentry position in the directory
+ dn->dir_offset = dn_offset;
/** clean underwater item?
* Underwater item is something that is dirty in our cache from
/**
* commit
*
- * @param want min version i want committed
- * @param c callback for completion
+ * @param want - min version i want committed
+ * @param c - callback for completion
*/
void CDir::commit(version_t want, Context *c)
{
// preconditions
assert(want <= version || version == 0); // can't commit the future
- assert(committed_version < want); // the caller is stupid
+ assert(want > committed_version); // the caller is stupid
assert(is_auth());
assert(can_auth_pin());
// complete?
if (!is_complete()) {
dout(7) << "commit not complete, fetching first" << endl;
+ if (cache->mds->logger) cache->mds->logger->inc("dir_ffc");
fetch(new C_Dir_RetryCommit(this, want));
return;
}
state_set(STATE_COMMITTING);
}
- if (cache->mds->logger) cache->mds->logger->inc("cdir");
+ if (cache->mds->logger) cache->mds->logger->inc("dir_c");
- // encode dentries
+ // encode
bufferlist bl;
- bl.append((char*)&version, sizeof(version));
-
+
+ ::_encode(version, bl);
+ int32_t n = nitems;
+ ::_encode(n, bl);
+
for (CDir_map_t::iterator it = items.begin();
it != items.end();
it++) {
if (dn->is_null())
continue; // skip negative entries
+ n--;
+
// primary or remote?
if (dn->is_remote()) {
inodeno_t ino = dn->get_remote_ino();
in->dirfragtree._encode(bl);
}
}
+ assert(n == 0);
// write it.
cache->mds->objecter->write( get_ondisk_object(),
/** set_dir_auth
- *
- * always list ourselves first.
- *
- * accept 'iamauth' param so that i can intelligently adjust freeze auth_pins
- * even when the auth bit isn't correct.
- * as when calling MDCache::import_subtree(...).
*/
-void CDir::set_dir_auth(pair<int,int> a, bool iamauth)
+void CDir::set_dir_auth(pair<int,int> a)
{
dout(10) << "setting dir_auth=" << a
<< " from " << dir_auth
if (is_subtree_root()) return; // no.
//assert(!is_import());
- inode->nested_auth_pins++;
- if (inode->parent)
- inode->parent->dir->adjust_nested_auth_pins( 1 );
+ inode->adjust_nested_auth_pins(1);
}
void CDir::auth_unpin()
assert(auth_pins >= 0);
// pending freeze?
- if (auth_pins + nested_auth_pins == 0)
- on_freezeable();
+ if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) &&
+ auth_pins == 1 &&
+ nested_auth_pins == 0)
+ finish_waiting(WAIT_FREEZEABLE);
// nest?
if (is_subtree_root()) return; // no.
//assert(!is_import());
- inode->nested_auth_pins--;
- if (inode->parent)
- inode->parent->dir->adjust_nested_auth_pins( -1 );
+ inode->adjust_nested_auth_pins(-1);
}
void CDir::adjust_nested_auth_pins(int inc)
{
- CDir *dir = this;
-
- // dir
- dir->nested_auth_pins += inc;
+ nested_auth_pins += inc; // inc may be negative; auth_pin()/auth_unpin() pass +1 / -1
- dout(10) << "adjust_nested_auth_pins " << inc << " on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl;
- assert(dir->nested_auth_pins >= 0);
+ dout(10) << "adjust_nested_auth_pins " << inc << " on " << *this
+ << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+ assert(nested_auth_pins >= 0);
// pending freeze?
- if (is_freezeable())
- dir->on_freezeable();
- // on freezeable_dir too? FIXME
+ if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) &&
+ auth_pins == 1 && // the one remaining pin is the 'freezing' pin taken by freeze_tree()/freeze_dir()
+ nested_auth_pins == 0)
+ finish_waiting(WAIT_FREEZEABLE); // wake the queued C_MDS_FreezeTree / C_MDS_FreezeDir waiter
// adjust my inode?
- if (dir->is_subtree_root())
+ if (is_subtree_root())
return; // no, stop.
// yes.
- dir->inode->adjust_nested_auth_pins(inc);
+ inode->adjust_nested_auth_pins(inc); // CInode forwards the same delta to its parent dentry
}
* FREEZING
*/
-void CDir::on_freezeable()
-{
- // check for anything pending freezeable
-
- /* NOTE: this will be called on deeper dirs first, walking up toward
- the root, meaning that deeper freeze attempts will succeed first.
- */
- /* NOTE: the first of these will likely freeze the dir, and unmark
- FREEZING. additional ones will re-flag FREEZING. this isn't
- particularly graceful, and might cause problems if the first one
- needs to know about other waiters.... FIXME? */
-
- finish_waiting(WAIT_FREEZEABLE);
-}
-
// FREEZE TREE
class C_MDS_FreezeTree : public Context {
{
assert(!is_frozen());
assert(!is_freezing());
+
+ // take our own 'freezing' auth_pin.  it is dropped again once the freeze
+ // completes (below, or in freeze_tree_finish) or is canceled (unfreeze_tree).
+ auth_pin();
- if (is_freezeable()) {
- dout(10) << "freeze_tree " << *this << endl;
- _freeze_tree(c);
+ // discount the pin we just took (freezing=true).  a plain is_freezeable()
+ // can never be true while we hold it, which would queue every freeze as a
+ // waiter even when there are no other pins left to release and wake it.
+ if (is_freezeable(true)) {
+ _freeze_tree();
+ auth_unpin();
+ if (c) {
+ c->finish(0);
+ delete c;
+ }
} else {
state_set(STATE_FREEZINGTREE);
- dout(10) << "freeze_tree + wait " << *this << endl;
-
- // need to wait for auth pins to expire
+ dout(10) << "freeze_tree waiting " << *this << endl;
add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
}
}
-void CDir::_freeze_tree(Context *c)
+void CDir::_freeze_tree()
{
dout(10) << "_freeze_tree " << *this << endl;
- // there shouldn't be any conflicting auth_pins.
- assert(is_freezeable_dir());
+ // there shouldn't be any conflicting auth_pins (except the 'freezing' one)
+ assert(is_freezeable(true));
// twiddle state
state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context?
// auth_pin inode for duration of freeze, if we are not a subtree root.
if (is_auth() && !is_subtree_root())
- inode->auth_pin();
-
- // continue to frozen land
- if (c) {
- c->finish(0);
- delete c;
- }
+ inode->auth_pin();
}
void CDir::freeze_tree_finish(Context *c)
}
// freezeable now?
- if (!is_freezeable()) {
- // wait again!
+ if (!is_freezeable(true)) {
dout(10) << "freeze_tree_finish still waiting " << *this << endl;
- state_set(STATE_FREEZINGTREE);
+ assert(state_test(STATE_FREEZINGTREE));
add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
return;
}
dout(10) << "freeze_tree_finish " << *this << endl;
- _freeze_tree(c);
+ _freeze_tree();
+ auth_unpin();
+ if (c) {
+ c->finish(0);
+ delete c;
+ }
}
void CDir::unfreeze_tree()
// freezing. stop it.
assert(state_test(STATE_FREEZINGTREE));
state_clear(STATE_FREEZINGTREE);
+ auth_unpin();
// cancel freeze waiters
finish_waiting(WAIT_UNFREEZE);
assert(!is_frozen());
assert(!is_freezing());
+ auth_pin();
if (is_freezeable_dir()) {
- dout(10) << "freeze_dir " << *this << endl;
- _freeze_dir(c);
+ _freeze_dir();
+ auth_unpin();
+ if (c) {
+ c->finish(0);
+ delete c;
+ }
} else {
state_set(STATE_FREEZINGDIR);
dout(10) << "freeze_dir + wait " << *this << endl;
-
- // need to wait for auth pins to expire
add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
}
}
-void CDir::_freeze_dir(Context *c)
+void CDir::_freeze_dir()
{
dout(10) << "_freeze_dir " << *this << endl;
- assert(is_freezeable_dir());
+ assert(is_freezeable_dir(true));
state_clear(STATE_FREEZINGDIR);
state_set(STATE_FROZENDIR);
if (is_auth() && !is_subtree_root())
inode->auth_pin(); // auth_pin for duration of freeze
-
- if (c) {
- c->finish(0);
- delete c;
- }
}
void CDir::freeze_dir_finish(Context *c)
}
// freezeable now?
- if (!is_freezeable_dir()) {
- // wait again!
+ if (!is_freezeable_dir(true)) {
dout(10) << "freeze_dir_finish still waiting " << *this << endl;
state_set(STATE_FREEZINGDIR);
add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+ // c now belongs to the re-queued waiter.  stop here (as freeze_tree_finish
+ // does) so we do not fall through, trip the assert in _freeze_dir(), drop
+ // the freezing pin, and finish/delete c a second time.
+ return;
}
// freeze now
- _freeze_dir(c);
+ dout(10) << "freeze_dir_finish " << *this << endl;
+ _freeze_dir();
+ auth_unpin();
+ if (c) {
+ c->finish(0);
+ delete c;
+ }
}
void CDir::unfreeze_dir()
// still freezing. stop.
assert(state_test(STATE_FREEZINGDIR));
state_clear(STATE_FREEZINGDIR);
+ auth_unpin();
// cancel freeze waiters
finish_waiting(WAIT_UNFREEZE);
static const int PIN_DNWAITER = 1;
static const int PIN_CHILD = 2;
static const int PIN_FROZEN = 3;
- static const int PIN_FRAGMENTING = 4;
static const int PIN_EXPORT = 5;
- static const int PIN_AUTHPIN = 6;
static const int PIN_IMPORTING = 7;
static const int PIN_EXPORTING = 8;
static const int PIN_IMPORTBOUND = 9;
case PIN_DNWAITER: return "dnwaiter";
case PIN_CHILD: return "child";
case PIN_FROZEN: return "frozen";
- case PIN_FRAGMENTING: return "fragmenting";
case PIN_EXPORT: return "export";
case PIN_EXPORTING: return "exporting";
case PIN_IMPORTING: return "importing";
case PIN_IMPORTBOUND: return "importbound";
case PIN_EXPORTBOUND: return "exportbound";
- case PIN_AUTHPIN: return "authpin";
case PIN_STICKY: return "sticky";
default: return generic_pin_name(p);
}
}
// -- state --
- static const unsigned STATE_COMPLETE = (1<< 2); // the complete contents are in cache
- static const unsigned STATE_FROZENTREE = (1<< 4); // root of tree (bounded by exports)
- static const unsigned STATE_FREEZINGTREE = (1<< 5); // in process of freezing
- static const unsigned STATE_FROZENDIR = (1<< 6);
- static const unsigned STATE_FREEZINGDIR = (1<< 7);
- static const unsigned STATE_COMMITTING = (1<< 8); // mid-commit
- static const unsigned STATE_FETCHING = (1<< 9); // currenting fetching
- static const unsigned STATE_DELETED = (1<<10);
- static const unsigned STATE_EXPORT = (1<<12);
- static const unsigned STATE_IMPORTBOUND = (1<<13);
- static const unsigned STATE_EXPORTBOUND = (1<<14);
- static const unsigned STATE_EXPORTING = (1<<15);
- static const unsigned STATE_IMPORTING = (1<<16);
- static const unsigned STATE_FRAGMENTING = (1<<17);
- static const unsigned STATE_STICKY = (1<<18); // sticky pin due to inode stickydirs
+ static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache
+ static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports)
+ static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing
+ static const unsigned STATE_FROZENDIR = (1<< 4); // set by _freeze_dir()
+ static const unsigned STATE_FREEZINGDIR = (1<< 5); // freeze_dir() is waiting (WAIT_FREEZEABLE)
+ static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit
+ static const unsigned STATE_FETCHING = (1<< 7); // currently fetching
+ static const unsigned STATE_DELETED = (1<< 8);
+ static const unsigned STATE_EXPORT = (1<< 9);
+ static const unsigned STATE_IMPORTBOUND = (1<<10);
+ static const unsigned STATE_EXPORTBOUND = (1<<11);
+ static const unsigned STATE_EXPORTING = (1<<12);
+ static const unsigned STATE_IMPORTING = (1<<13);
+ static const unsigned STATE_FRAGMENTING = (1<<14);
+ static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs
+ static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting
// common states
static const unsigned STATE_CLEAN = 0;
static const unsigned MASK_STATE_FRAGMENT_KEPT =
(STATE_DIRTY |
STATE_COMPLETE |
- STATE_FROZENDIR |
STATE_EXPORT |
STATE_EXPORTBOUND |
- STATE_IMPORTBOUND |
- STATE_STICKY);
+ STATE_IMPORTBOUND);
// -- rep spec --
static const int REP_NONE = 0;
protected:
// contents
- CDir_map_t items; // non-null AND null
- size_t nitems; // # non-null
- size_t nnull; // # null
+ CDir_map_t items; // non-null AND null
+ unsigned nitems; // # non-null
+ unsigned nnull; // # null
int num_dirty;
// state
- version_t version;
- version_t committing_version;
- version_t committed_version;
- version_t committed_version_equivalent; // in case of, e.g., temporary file
- version_t projected_version;
+ version_t version;
+ version_t committing_version;
+ version_t committed_version;
+ version_t committed_version_equivalent; // in case of, e.g., temporary file
+ version_t projected_version;
// lock nesting, freeze
- int auth_pins;
- int nested_auth_pins;
- int request_pins;
+ int auth_pins;
+ int nested_auth_pins;
+ int request_pins;
// cache control (defined for authority; hints for replicas)
- int dir_rep;
- set<int> dir_rep_by; // if dir_rep == REP_LIST
+ int dir_rep;
+ set<int> dir_rep_by; // if dir_rep == REP_LIST
// popularity
- meta_load_t popularity[MDS_NPOP];
+ dirfrag_load_vec_t pop_me;
+ dirfrag_load_vec_t pop_nested;
+ dirfrag_load_vec_t pop_auth_subtree;
+ dirfrag_load_vec_t pop_auth_subtree_nested;
+
+ utime_t last_popularity_sample;
// friends
friend class Migrator;
CDir_map_t::iterator begin() { return items.begin(); }
CDir_map_t::iterator end() { return items.end(); }
- size_t get_size() {
+ unsigned get_size() {
return nitems;
}
- size_t get_nitems() { return nitems; }
- size_t get_nnull() { return nnull; }
+ unsigned get_nitems() { return nitems; }
+ unsigned get_nnull() { return nnull; }
void inc_num_dirty() { num_dirty++; }
void dec_num_dirty() {
return iter->second;
}
- CDentry* add_dentry( const string& dname, CInode *in=0 );
- CDentry* add_dentry( const string& dname, inodeno_t ino );
+ CDentry* add_null_dentry(const string& dname);
+ CDentry* add_primary_dentry(const string& dname, CInode *in);
+ CDentry* add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type);
void remove_dentry( CDentry *dn ); // delete dentry
- void link_inode( CDentry *dn, inodeno_t ino );
- void link_inode( CDentry *dn, CInode *in );
+ void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type);
+ void link_primary_inode( CDentry *dn, CInode *in );
void unlink_inode( CDentry *dn );
void try_remove_unlinked_dn(CDentry *dn);
private:
public:
pair<int,int> authority();
pair<int,int> get_dir_auth() { return dir_auth; }
- void set_dir_auth(pair<int,int> a, bool iamauth=false);
- void set_dir_auth(int a) {
- set_dir_auth(pair<int,int>(a, CDIR_AUTH_UNKNOWN), false);
- }
+ void set_dir_auth(pair<int,int> a);
+ void set_dir_auth(int a) { set_dir_auth(pair<int,int>(a, CDIR_AUTH_UNKNOWN)); }
bool is_ambiguous_dir_auth() {
return dir_auth.second != CDIR_AUTH_UNKNOWN;
}
// for giving to clients
void get_dist_spec(set<int>& ls, int auth) {
- if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() >
+ if (( pop_auth_subtree.get(META_POP_IRD).get() >
g_conf.mds_bal_replicate_threshold)) {
//if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
for (map<int,int>::iterator p = replicas_begin();
// -- fetch --
object_t get_ondisk_object() { return object_t(ino(), frag); }
- void fetch(Context *c);
+ void fetch(Context *c, bool ignore_authpinnability=false);
void _fetched(bufferlist &bl);
// -- commit --
void auth_pin();
void auth_unpin();
void adjust_nested_auth_pins(int inc);
- void on_freezeable();
// -- freezing --
void freeze_tree(Context *c);
void freeze_tree_finish(Context *c);
void unfreeze_tree();
- void _freeze_tree(Context *c=0);
+ void _freeze_tree();
void freeze_dir(Context *c);
void freeze_dir_finish(Context *c);
- void _freeze_dir(Context *c=0);
+ void _freeze_dir();
void unfreeze_dir();
bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); }
bool is_frozen_tree_root() { return state & STATE_FROZENTREE; }
bool is_frozen_dir() { return state & STATE_FROZENDIR; }
- bool is_freezeable() {
+ bool is_freezeable(bool freezing=false) { // freezing=true: caller already holds the 'freezing' auth_pin
// no nested auth pins.
- if (auth_pins > 0 || nested_auth_pins > 0)
+ if ((auth_pins-freezing) > 0 || nested_auth_pins > 0) // bool promotes to 0/1, discounting that one pin
return false;
// inode must not be frozen.
return true;
}
- bool is_freezeable_dir() {
- if (auth_pins > 0)
+ bool is_freezeable_dir(bool freezing=false) { // freezing=true: caller already holds the 'freezing' auth_pin
+ if ((auth_pins-freezing) > 0) // bool promotes to 0/1, discounting that one pin; nested pins are not considered here
return false;
- // if not subtree root, inode must not be frozen.
- if (!is_subtree_root() && inode->is_frozen())
+ // if not subtree root, inode must not be frozen (tree--frozen_dir is okay).
+ if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir())
return false;
return true;
uint32_t nden; // num dentries (including null ones)
version_t version;
version_t committed_version;
+ version_t committed_version_equivalent;
uint32_t state;
- meta_load_t popularity_justme;
- meta_load_t popularity_curdom;
+ dirfrag_load_vec_t pop_me;
+ dirfrag_load_vec_t pop_auth_subtree;
int32_t dir_rep;
} st;
map<int,int> replicas;
public:
CDirExport() {}
- CDirExport(CDir *dir) {
+ CDirExport(CDir *dir, utime_t now) {
memset(&st, 0, sizeof(st));
assert(dir->get_version() == dir->get_projected_version());
st.nden = dir->items.size();
st.version = dir->version;
st.committed_version = dir->committed_version;
+ st.committed_version_equivalent = dir->committed_version_equivalent;
st.state = dir->state;
st.dir_rep = dir->dir_rep;
-
- st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] );
- st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] );
- dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
- dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+
+ st.pop_me = dir->pop_me;
+ st.pop_auth_subtree = dir->pop_auth_subtree;
+ dir->pop_auth_subtree_nested -= dir->pop_auth_subtree;
+ dir->pop_me.zero(now);
+ dir->pop_auth_subtree.zero(now);
rep_by = dir->dir_rep_by;
replicas = dir->replica_map;
assert(dir->dirfrag() == st.dirfrag);
// set committed_version at old version
- dir->committing_version = dir->committed_version = st.committed_version;
- dir->projected_version = dir->version = st.version;
+ dir->committing_version =
+ dir->committed_version = st.committed_version;
+ dir->committed_version_equivalent = st.committed_version_equivalent;
+ dir->projected_version =
+ dir->version = st.version;
// twiddle state
dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc.
(st.state & CDir::MASK_STATE_EXPORTED);
dir->dir_rep = st.dir_rep;
- dir->popularity[MDS_POP_JUSTME] += st.popularity_justme;
- dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
- dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
- dir->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+ dir->pop_me = st.pop_me;
+ dir->pop_auth_subtree = st.pop_auth_subtree;
+ dir->pop_auth_subtree_nested += dir->pop_auth_subtree;
dir->replica_nonce = 0; // no longer defined
}
if (in.is_symlink()) out << " symlink";
-
+ if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree;
+
out << " v" << in.get_version();
// locks
ls.push_back(dirfrags[*p]);
}
+/** get_approx_dirfrag
+ *
+ * return an open dirfrag "near" fg: fg itself if it is open, else the
+ * first open dirfrag beneath fg, else the closest open ancestor of fg.
+ *
+ * @param fg - frag we would like a dirfrag for
+ *
+ * NOTE(review): the upward walk has no explicit exit; it presumably relies
+ * on some ancestor being open (or on frag_t::parent() stopping at the
+ * root) -- confirm before calling on an inode with no open dirfrags.
+ */
+CDir *CInode::get_approx_dirfrag(frag_t fg)
+{
+  // exact match?
+  if (CDir *exact = get_dirfrag(fg))
+    return exact;
+
+  // anything open underneath fg?
+  list<CDir*> below;
+  get_dirfrags_under(fg, below);
+  if (!below.empty())
+    return below.front();
+
+  // otherwise climb toward the root until we hit an open dirfrag.
+  for (;;) {
+    fg = fg.parent();
+    if (CDir *above = get_dirfrag(fg))
+      return above;
+  }
+}
+
void CInode::get_dirfrags(list<CDir*>& ls)
{
// all dirfrags
-void CInode::fragment_dir(frag_t basefrag, int bits, list<CDir*>& subs, list<Context*>& waiters)
-{
- dout(10) << "fragment_dir " << bits << endl;
-
- CDir *base = get_or_open_dirfrag(mdcache, basefrag);
-
- dirfragtree.split(basefrag, bits);
- if (bits > 0) {
- base->split(bits, subs, waiters);
- } else {
- base->merge(bits, waiters);
- }
-}
-
// pins
void CInode::name_stray_dentry(string& dname)
{
- char s[20];
- sprintf(s, "%ld", inode.ino.val);
+ // name the stray dentry after the inode number, printed in decimal.
+ //
+ // a 64-bit value can print as 20 characters, which with the terminating
+ // NUL does not fit the old 20-byte buffer; use a roomier buffer and a
+ // bounded write.
+ char s[32];
+ // assumes inode.ino.val is a 64-bit integer (the %lld variant implied so).
+ // casting to long long lets one format serve both LP64 and 32-bit builds
+ // and prints the same text as the old "%ld" / "%lld" pair.
+ snprintf(s, sizeof(s), "%lld", (long long)inode.ino.val);
dname = s;
}
break;
case LOCK_OTYPE_IDIRFRAGTREE:
- dirfragtree._encode(bl);
+ {
+ // encode the raw tree
+ dirfragtree._encode(bl);
+
+ // also specify which frags are mine
+ set<frag_t> myfrags;
+ list<CDir*> dfls;
+ get_dirfrags(dfls);
+ for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p)
+ if ((*p)->is_auth())
+ myfrags.insert((*p)->get_frag());
+ _encode(myfrags, bl);
+ }
break;
case LOCK_OTYPE_IFILE:
break;
case LOCK_OTYPE_IDIRFRAGTREE:
- dirfragtree._decode(bl, off);
+ {
+ fragtree_t temp;
+ temp._decode(bl, off);
+ set<frag_t> authfrags;
+ _decode(authfrags, bl, off);
+ if (is_auth()) {
+ // auth. believe replica's auth frags only.
+ for (set<frag_t>::iterator p = authfrags.begin(); p != authfrags.end(); ++p)
+ dirfragtree.force_to_leaf(*p);
+ } else {
+ // replica. just take the tree.
+ dirfragtree.swap(temp);
+ }
+ }
break;
case LOCK_OTYPE_IFILE:
// auth_pins
bool CInode::can_auth_pin() {
if (parent)
- return parent->dir->can_auth_pin();
+ return parent->can_auth_pin();
return true;
}
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
if (parent)
- parent->dir->adjust_nested_auth_pins( 1 );
+ parent->adjust_nested_auth_pins( 1 );
}
void CInode::auth_unpin()
assert(auth_pins >= 0);
if (parent)
- parent->dir->adjust_nested_auth_pins( -1 );
+ parent->adjust_nested_auth_pins( -1 );
}
void CInode::adjust_nested_auth_pins(int a)
{
if (!parent) return;
nested_auth_pins += a;
- parent->get_dir()->adjust_nested_auth_pins(a);
+ parent->adjust_nested_auth_pins(a);
}
pair<int,int> CInode::authority()
{
- //if (is_root())
- //return CDIR_AUTH_ROOTINODE; // root _inode_ is locked to mds0.
if (force_auth.first >= 0)
return force_auth;
class CInode : public MDSCacheObject {
public:
// -- pins --
- //static const int PIN_REPLICATED = 1;
- static const int PIN_DIR = 2;
- static const int PIN_CAPS = 7; // client caps
- static const int PIN_AUTHPIN = 8;
- static const int PIN_IMPORTING = -9; // importing
- static const int PIN_ANCHORING = 12;
- static const int PIN_UNANCHORING = 13;
- static const int PIN_OPENINGDIR = 14;
- static const int PIN_REMOTEPARENT = 15;
- static const int PIN_BATCHOPENJOURNAL = 16;
- static const int PIN_SCATTERED = 17;
- static const int PIN_STICKYDIRS = 18;
+ static const int PIN_DIRFRAG = -1;
+ static const int PIN_CAPS = 2; // client caps
+ static const int PIN_IMPORTING = -4; // importing
+ static const int PIN_ANCHORING = 5;
+ static const int PIN_UNANCHORING = 6;
+ static const int PIN_OPENINGDIR = 7;
+ static const int PIN_REMOTEPARENT = 8;
+ static const int PIN_BATCHOPENJOURNAL = 9;
+ static const int PIN_SCATTERED = 10;
+ static const int PIN_STICKYDIRS = 11;
const char *pin_name(int p) {
switch (p) {
- case PIN_DIR: return "dir";
+ case PIN_DIRFRAG: return "dirfrag";
case PIN_CAPS: return "caps";
- case PIN_AUTHPIN: return "authpin";
case PIN_IMPORTING: return "importing";
case PIN_ANCHORING: return "anchoring";
case PIN_UNANCHORING: return "unanchoring";
fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map.
//map<frag_t,int> dirfrag_size; // size of each dirfrag
- off_t last_open_journaled; // log offset for the last journaled EOpen
+ off_t last_journaled; // log offset for the last time i was journaled
+ off_t last_open_journaled; // log offset for the last journaled EOpen
// projected values (only defined while dirty)
list<inode_t*> projected_inode;
frag_t pick_dirfrag(const string &dn);
bool has_dirfrags() { return !dirfrags.empty(); }
CDir* get_dirfrag(frag_t fg) {
- if (dirfrags.count(fg))
+ if (dirfrags.count(fg)) {
+ assert(dirfragtree.is_leaf(fg)); // an open dirfrag must be a leaf of dirfragtree (tree is kept consistent with the dirfrag map)
return dirfrags[fg];
- else
+ } else // fg is not open
return 0;
}
void get_dirfrags_under(frag_t fg, list<CDir*>& ls);
+ CDir* get_approx_dirfrag(frag_t fg);
void get_dirfrags(list<CDir*>& ls);
void get_nested_dirfrags(list<CDir*>& ls);
void get_subtree_dirfrags(list<CDir*>& ls);
void get_stickydirs();
void put_stickydirs();
- void fragment_dir(frag_t basefrag, int bits, list<CDir*>& subs, list<Context*>& waiters);
-
protected:
// parent dentries in cache
CDentry *parent; // primary link
int nested_auth_pins;
public:
- meta_load_t popularity[MDS_NPOP];
+ inode_load_vec_t pop;
// friends
friend class Server;
// ---------------------------
CInode(MDCache *c, bool auth=true) :
mdcache(c),
- last_open_journaled(0),
+ last_journaled(0), last_open_journaled(0),
stickydir_ref(0),
parent(0), force_auth(CDIR_AUTH_DEFAULT),
replica_caps_wanted(0),
return ino() < ((CInode*)r)->ino();
}
-
-
// -- misc --
void make_path(string& s);
void make_anchor_trace(vector<class Anchor>& trace);
LocalLock versionlock;
SimpleLock authlock;
SimpleLock linklock;
- SimpleLock dirfragtreelock;
+ ScatterLock dirfragtreelock;
FileLock filelock;
ScatterLock dirlock;
case LOCK_OTYPE_ILINK: return &linklock;
case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock;
case LOCK_OTYPE_IDIR: return &dirlock;
- default: assert(0);
+ default: assert(0); return 0;
}
}
void set_object_info(MDSCacheObjectInfo &info);
struct st_ {
inode_t inode;
- meta_load_t popularity_justme;
- meta_load_t popularity_curdom;
+ inode_load_vec_t pop;
+
bool is_dirty; // dirty inode?
int num_caps;
public:
CInodeExport() {}
- CInodeExport(CInode *in) {
+ CInodeExport(CInode *in, utime_t now) {
st.inode = in->inode;
symlink = in->symlink;
dirfragtree = in->dirfragtree;
in->filelock._encode(locks);
in->dirlock._encode(locks);
- st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] );
- st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] );
- in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
- in->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+ st.pop = in->pop;
+ in->pop.zero(now);
// steal WRITER caps from inode
in->take_client_caps(cap_map);
in->symlink = symlink;
in->dirfragtree = dirfragtree;
- in->popularity[MDS_POP_JUSTME] += st.popularity_justme;
- in->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
- in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
- in->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+ in->pop = st.pop;
if (st.is_dirty)
in->_mark_dirty();
last_sent(s),
last_recv(s),
suppress(false) {
- //cap_history[last_sent] = 0;
}
Capability(Export& other) :
wanted_caps(other.wanted),
case LOCK_LONER: return "loner";
case LOCK_GLONERR: return "glonerr";
case LOCK_GLONERM: return "glonerm";
- default: assert(0);
+ default: assert(0); return 0;
}
}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-
-// =======================================================================
-// HASHING
-
-
-void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
-{
- int off = 0;
-
- for (; nden>0; nden--) {
- // dentry
- string dname;
- _decode(dname, bl, off);
- dout(15) << "dname is " << dname << endl;
-
- char icode;
- bl.copy(off, 1, &icode);
- off++;
-
- CDentry *dn = dir->lookup(dname);
- if (!dn)
- dn = dir->add_dentry(dname); // null
-
- // mark dn dirty _after_ we link the inode (scroll down)
-
- if (icode == 'N') {
-
- // null dentry
- assert(dn->is_null());
-
- // fall thru
- }
- else if (icode == 'L') {
- // remote link
- inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- dir->link_inode(dn, ino);
- }
- else if (icode == 'I') {
- // inode
- decode_import_inode(dn, bl, off, oldauth);
-
- // fix up subdir export?
- if (dn->inode->dir) {
- assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTBOUND));
- dn->inode->dir->put(CDir::PIN_IMPORTBOUND);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORTBOUND);
-
- if (dn->inode->dir->is_auth()) {
- // mine. must have been an import.
- assert(dn->inode->dir->is_import());
- dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl;
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- cache->imports.erase(dn->inode->dir);
- dn->inode->dir->put(CDir::PIN_IMPORT);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORT);
-
- // move nested under hashdir
- for (set<CDir*>::iterator it = cache->nested_exports[dn->inode->dir].begin();
- it != cache->nested_exports[dn->inode->dir].end();
- it++)
- cache->nested_exports[dir].insert(*it);
- cache->nested_exports.erase(dn->inode->dir);
-
- // now it matches the inode
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- else {
- // not mine. make it an export.
- dout(7) << "making subdir into export " << *dn->inode->dir << endl;
- dn->inode->dir->get(CDir::PIN_EXPORT);
- dn->inode->dir->state_set(CDIR_STATE_EXPORT);
- cache->exports.insert(dn->inode->dir);
- cache->nested_exports[dir].insert(dn->inode->dir);
-
- if (dn->inode->dir->get_dir_auth().first == CDIR_AUTH_PARENT)
- dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode
- assert(dn->inode->dir->get_dir_auth().first >= 0);
- }
- }
- }
-
- // mark dentry dirty? (only _after_ we link the inode!)
- dn->_mark_dirty(); // fixme
- }
-}
-
-/*
-
- notes on interaction of hashing and export/import:
-
- - dir->is_auth() is completely independent of hashing. for a hashed dir,
- - all nodes are partially authoritative
- - all nodes dir->is_hashed() == true
- - all nodes dir->inode->dir_is_hashed() == true
- - one node dir->is_auth() == true, the rest == false
- - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
-
- - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
-
- - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export)
- - on export, -1 is changed to mds->get_nodeid()
- - on import, nothing special, actually.
-
- - hashed dir files aren't included in export; subdirs are converted to imports
- or exports as necessary.
- - hashed dir subdirs are discovered on export. this is important
- because dirs are needed to tie together auth hierarchy, for auth to know about
- imports/exports, etc.
-
- - dir state is maintained on auth.
- - COMPLETE and HASHED are transfered to importers.
- - DIRTY is set everywhere.
-
- - hashed dir is like an import: hashed dir used for nested_exports map.
- - nested_exports is updated appropriately on auth and replicas.
- - a subtree terminates as a hashed dir, since the hashing explicitly
- redelegates all inodes. thus export_dir_walk includes hashed dirs, but
- not their inodes.
-*/
-
-// HASH on auth
-
-class C_MDC_HashFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->hash_dir_frozen(dir);
- }
-};
-
-class C_MDC_HashComplete : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_HashComplete(Migrator *mig, CDir *dir) {
- this->mig = mig;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mig->hash_dir_complete(dir);
- }
-};
-
-
-/** hash_dir(dir)
- * start hashing a directory.
- */
-void Migrator::hash_dir(CDir *dir)
-{
- dout(-7) << "hash_dir " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- if (dir->is_frozen() ||
- dir->is_freezing()) {
- dout(7) << " can't hash, freezing|frozen." << endl;
- return;
- }
-
- // pin path?
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- if (!cache->path_pin(trace, 0, 0)) {
- dout(7) << "hash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // ok, go
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDir::PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // discover on all mds
- assert(hash_gather.count(dir) == 0);
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue; // except me
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR);
- }
- dir->auth_pin(); // pin until discovers are all acked.
-
- // start freeze
- dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_HashComplete(this, dir));
- } else
- hash_dir_complete(dir);
-}
-
-
-/*
- * wait for everybody to discover and open the hashing dir
- * then auth_unpin, to let the freeze happen
- */
-void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
- dir->auth_unpin(); // unpin to allow freeze to complete
- } else {
- dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
- }
-
- delete m; // done
-}
-
-
-
-/*
- * once the dir is completely in memory,
- * mark all migrating inodes dirty (to pin in cache)
- */
-void Migrator::hash_dir_complete(CDir *dir)
-{
- dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- in->_mark_dirty(); // fixme
- }
-
- if (dir->is_frozen_dir())
- hash_dir_go(dir);
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * send the prep messages!
- */
-void Migrator::hash_dir_frozen(CDir *dir)
-{
- dout(7) << "hash_dir_frozen " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
- return;
- }
-
- // send prep messages w/ export directories to open
- vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
-
- // check for subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) continue;
-
- // msg?
- if (msgs[dentryhashcode] == 0) {
- msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
- }
- msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
- }
-
- // send them!
- assert(hash_gather[dir].empty());
- for (unsigned i=0; i<msgs.size(); i++) {
- if (msgs[i]) {
- mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
- hash_gather[dir].insert(i);
- }
- }
-
- if (hash_gather[dir].empty()) {
- // no subdirs! continue!
- hash_gather.erase(dir);
- hash_dir_go(dir);
- } else {
- // wait!
- }
-}
-
-/*
- * wait for peers to open all subdirs
- */
-void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
-
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
- hash_dir_go(dir);
- } else {
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * do the hashing!
- */
-void Migrator::hash_dir_go(CDir *dir)
-{
- dout(7) << "hash_dir_go " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- // get messages to other nodes ready
- vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- msgs[i] = new MHashDir(dir->ino());
- }
-
- // pick a hash seed.
- dir->inode->inode.hash_seed = 1;//dir->ino();
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // get containing import. might be me.
- CDir *containing_import = cache->get_auth_container(dir);
- assert(containing_import != dir || dir->is_import());
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) {
- continue; // still mine!
- }
-
- bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
- assert(bl);
-
- // -- dentry
- dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, *bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl->append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl->append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl->append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl->append("I", 1); // inode dentry
-
- encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export
- msgs[dentryhashcode]->inc_nden();
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // add to proxy
- hash_proxy_inos[dir].push_back(in);
- in->state_set(CInode::STATE_PROXY);
- in->get(CInode::PIN_PROXY);
-
- // fix up subdirs
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- cache->imports.insert(in->dir);
- in->dir->get(CDir::PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
-
- // fix nested bits
- for (set<CDir*>::iterator it = cache->nested_exports[containing_import].begin();
- it != cache->nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (cache->get_auth_container(ex) == in->dir) {
- dout(10) << "moving nested export " << *ex << endl;
- cache->nested_exports[containing_import].erase(ex);
- cache->nested_exports[in->dir].insert(ex);
- }
- }
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDir::PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- cache->exports.erase(in->dir);
- cache->nested_exports[containing_import].erase(in->dir);
- if (in->dir->authority() == dentryhashcode)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- in->dir->set_dir_auth( in->dir->authority() );
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDir::PIN_HASHED);
- cache->hashdirs.insert(dir);
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdlog->submit_entry(new EString("dirty dir fixme"));
-
- // inode state
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash dirty fixme"));
- }
-
- // fix up nested_exports?
- if (containing_import != dir) {
- dout(7) << "moving nested exports under hashed dir" << endl;
- for (set<CDir*>::iterator it = cache->nested_exports[containing_import].begin();
- it != cache->nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (cache->get_auth_container(ex) == dir) {
- dout(7) << " moving nested export under hashed dir: " << *ex << endl;
- cache->nested_exports[containing_import].erase(ex);
- cache->nested_exports[dir].insert(ex);
- } else {
- dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
- }
- }
- }
-
- // send hash messages
- assert(hash_gather[dir].empty());
- assert(hash_notify_gather[dir].empty());
- assert(dir->hashed_subset.empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- // all nodes hashed locally..
- dir->hashed_subset.insert(i);
-
- if (i == mds->get_nodeid()) continue;
-
- // init hash_gather and hash_notify_gather sets
- hash_gather[dir].insert(i);
-
- assert(hash_notify_gather[dir][i].empty());
- for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
- if (j == mds->get_nodeid()) continue;
- if (j == i) continue;
- hash_notify_gather[dir][i].insert(j);
- }
-
- mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
- }
-
- // wait for all the acks.
-}
-
-
-void Migrator::handle_hash_dir_ack(MHashDirAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "got notifies too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "waiting on notifies " << endl;
- }
-
- } else {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-void Migrator::hash_dir_finish(CDir *dir)
-{
- dout(7) << "hash_dir_finish finishing " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- // dir state
- hash_gather.erase(dir);
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDir::PIN_HASHING);
- dir->hashed_subset.clear();
-
- // unproxy inodes
- // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CInode::STATE_PROXY));
- in->state_clear(CInode::STATE_PROXY);
- in->put(CInode::PIN_PROXY);
- }
- hash_proxy_inos.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- cache->path_unpin(trace, 0);
-
- // unfreeze
- dir->unfreeze_dir();
-
- show_imports();
- assert(hash_gather.count(dir) == 0);
-
- // stats
- //if (mds->logger) mds->logger->inc("nh", 1);
-
-}
-
-
-
-
-// HASH on auth and non-auth
-
-void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir_notify " << *dir << endl;
- int from = m->get_from();
-
- int source = m->get_source().num();
- if (dir->is_auth()) {
- // gather notifies
- assert(dir->is_hashed());
-
- assert( hash_notify_gather[dir][from].count(source) );
- hash_notify_gather[dir][from].erase(source);
-
- if (hash_notify_gather[dir][from].empty()) {
- dout(7) << "last notify from " << from << endl;
- hash_notify_gather[dir].erase(from);
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "last notify!" << endl;
- hash_notify_gather.erase(dir);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "got acks too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
- }
- } else {
- dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
- }
- } else {
- dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
- }
-
- // delete msg
- delete m;
- } else {
- // update dir hashed_subset
- assert(dir->hashed_subset.count(from) == 0);
- dir->hashed_subset.insert(from);
-
- // update open subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->get_inode();
- if (!in) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != from) continue; // we'll import these in a minute
-
- if (in->dir->authority() != dentryhashcode)
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
-
- // remove from notify gather set
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- // last notify?
- if (hash_gather[dir].empty()) {
- dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
- hash_gather.erase(dir);
-
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDir::PIN_HASHING);
- dir->hashed_subset.clear();
- } else {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- }
-
- // fw notify to auth
- mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
- }
-}
-
-
-
-
-// HASH on non-auth
-
-/*
- * discover step:
- * each peer needs to open up the directory and pin it before we start
- */
-class C_MDC_HashDirDiscover : public Context {
- Migrator *mig;
- MHashDirDiscover *m;
-public:
- vector<CDentry*> trace;
- C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) {
- this->mig = mig;
- this->m = m;
- }
- void finish(int r) {
- CInode *in = 0;
- if (r >= 0) {
- if (trace.size())
- in = trace[trace.size()-1]->get_inode();
- else
- in = mig->cache->get_root();
- }
- mig->handle_hash_dir_discover_2(m, in, r);
- }
-};
-
-void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
-{
- assert(m->get_source().num() != mds->get_nodeid());
-
- dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
-
- // must discover it!
- C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m);
- filepath fpath(m->get_path());
- cache->path_traverse(fpath, onfinish->trace, true,
- m, new C_MDS_RetryMessage(mds,m), // on delay/retry
- MDS_TRAVERSE_DISCOVER,
- onfinish); // on completion|error
-}
-
-void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
-{
- // yay!
- if (in) {
- dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
- }
-
- if (r < 0 || !in->is_dir()) {
- dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
- assert(0); // this shouldn't happen if the auth pins his path properly!!!!
- }
- assert(in->is_dir());
-
- // is dir open?
- if (!in->dir) {
- dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- return;
- }
- CDir *dir = in->dir;
-
- // pin dir, set hashing flag
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDir::PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // inode state
- dir->inode->inode.hash_seed = 1;// dir->ino();
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash dirty fixme"));
- }
-
- // get gather set ready for notifies
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == dir->authority()) continue;
- hash_gather[dir].insert(i);
- }
-
- // reply
- dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
- mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
- delete m;
-}
-
-/*
- * prep step:
- * peers need to open up all subdirs of the hashed dir
- */
-
-void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_hash_dir_prep " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(mds->mdcache, false);
- it->second->update_inode(in);
- cache->add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDir::PIN_IMPORTBOUND);
- in->dir->state_set(CDIR_STATE_IMPORTBOUND);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ack!
- mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
-
- // done.
- delete m;
-}
-
-
-/*
- * hash step:
- */
-
-void Migrator::handle_hash_dir(MHashDir *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(!dir->is_auth());
- assert(!dir->is_hashed());
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir " << *dir << endl;
- int oldauth = m->get_source().num();
-
- // content
- import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDir::PIN_HASHED);
- cache->hashdirs.insert(dir);
- dir->hashed_subset.insert(mds->get_nodeid());
-
- // dir is complete
- dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdlog->submit_entry(new EString("dirty dir fixme"));
-
- // commit
- mds->mdstore->commit_dir(dir, 0);
-
- // send notifies
- dout(7) << "sending notifies" << endl;
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == m->get_source().num()) continue;
- mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
- i, MDS_PORT_MIGRATOR);
- }
-
- // ack
- dout(7) << "acking" << endl;
- mds->send_message_mds(new MHashDirAck(dir->ino()),
- m->get_source().num(), MDS_PORT_MIGRATOR);
-
- // done.
- delete m;
-
- show_imports();
-}
-
-
-
-
-
-// UNHASH on auth
-
-class C_MDC_UnhashFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_frozen(dir);
- }
-};
-
-class C_MDC_UnhashComplete : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_complete(dir);
- }
-};
-
-
-void Migrator::unhash_dir(CDir *dir)
-{
- dout(-7) << "unhash_dir " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(!dir->is_unhashing());
- assert(dir->is_auth());
- assert(hash_gather.count(dir)==0);
-
- // pin path?
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- if (!cache->path_pin(trace, 0, 0)) {
- dout(7) << "unhash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // twiddle state
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // first, freeze the dir.
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(this, dir));
- } else
- unhash_dir_complete(dir);
-
-}
-
-void Migrator::unhash_dir_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep(dir);
-}
-
-
-/*
- * ask peers to freeze and complete hashed dir
- */
-void Migrator::unhash_dir_prep(CDir *dir)
-{
- dout(7) << "unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-/*
- * wait for peers to freeze and complete hashed dirs
- */
-void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = m->get_source().num();
- dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(mds->mdcache, false);
- it->second->update_inode(in);
- cache->add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- cache->open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = cache->get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDir::PIN_IMPORTBOUND);
- in->dir->state_set(CDIR_STATE_IMPORTBOUND);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ok, done with this PrepAck
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
- unhash_dir_go(dir);
- } else {
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * auth:
- * send out MHashDir's to peers
- */
-void Migrator::unhash_dir_go(CDir *dir)
-{
- dout(7) << "unhash_dir_go " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDir(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-/*
- * auth:
- * assimilate unhashing content
- */
-void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(dir->is_hashed());
-
- // assimilate content
- int from = m->get_source().num();
- import_hashed_content(dir, m->get_state(), m->get_nden(), from);
- delete m;
-
- // done?
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
- return;
- }
-
- // done!
-
- // fix up nested_exports
- CDir *containing_import = cache->get_auth_container(dir);
- if (containing_import != dir) {
- for (set<CDir*>::iterator it = cache->nested_exports[dir].begin();
- it != cache->nested_exports[dir].end();
- it++) {
- dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
- cache->nested_exports[containing_import].insert(*it);
- }
- cache->nested_exports.erase(dir);
- }
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); //later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDir::PIN_HASHED);
- cache->hashdirs.erase(dir);
-
- // commit!
- assert(dir->is_complete());
- //dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty()); // fixme
- mds->mdstore->commit_dir(dir, 0);
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("hash inode dirty fixme"));
- }
-
- // notify
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
-
- hash_gather[dir].insert(i);
-
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
- i, MDS_PORT_MIGRATOR);
- }
-}
-
-
-/*
- * sent by peer to flush mds links. unfreeze when all gathered.
- */
-void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(dir->is_frozen_dir());
-
- // done?
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
- delete m;
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
- } else {
- unhash_dir_finish(dir);
- }
-}
-
-
-/*
- * all mds links are flushed. unfreeze dir!
- */
-void Migrator::unhash_dir_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_finish " << *dir << endl;
- hash_gather.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- cache->make_trace(trace, dir->inode);
- cache->path_unpin(trace, 0);
-
- // state
- dir->state_clear(CDIR_STATE_UNHASHING);
-
- // unfreeze
- dir->unfreeze_dir();
-
-}
-
-
-
-// UNHASH on all
-
-/*
- * hashed dir is complete.
- * mark all migrating inodes dirty (to pin in cache)
- * if frozen too, then go to next step (depending on auth)
- */
-void Migrator::unhash_dir_complete(CDir *dir)
-{
- dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_complete());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- if (in->is_auth()) {
- in->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("unhash dirty fixme"));
- }
- }
-
- if (!dir->is_frozen_dir()) {
- dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
- } else {
- if (dir->is_auth())
- unhash_dir_prep(dir); // auth
- else
- unhash_dir_prep_finish(dir); // nonauth
- }
-}
-
-
-// UNHASH on non-auth
-
-class C_MDC_UnhashPrepFreeze : public Context {
-public:
- Migrator *mig;
- CDir *dir;
- C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {}
- virtual void finish(int r) {
- mig->unhash_dir_prep_frozen(dir);
- }
-};
-
-
-/*
- * peers need to freeze their dir and make them complete
- */
-void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
-
- // freeze
- dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(this, dir));
- } else {
- unhash_dir_complete(dir);
- }
-
- delete m;
-}
-
-/*
- * peer has hashed dir frozen.
- * complete too?
- */
-void Migrator::unhash_dir_prep_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_frozen_dir());
- assert(!dir->is_auth());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep_finish(dir);
-}
-
-/*
- * peer has hashed dir complete and frozen. ack.
- */
-void Migrator::unhash_dir_prep_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_finish " << *dir << endl;
- assert(dir->is_hashed());
- assert(!dir->is_auth());
- assert(dir->is_frozen());
- assert(dir->is_complete());
-
- // twiddle state
- if (dir->is_unhashing())
- return; // already replied.
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // send subdirs back to auth
- MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
- int auth = dir->authority();
-
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) continue;
-
- // msg?
- ack->add_inode(it->first, in->replicate_to(auth));
- }
-
- // ack
- mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
-}
-
-
-
-/*
- * peer needs to send hashed dir content back to auth.
- * unhash dir.
- */
-void Migrator::handle_unhash_dir(MUnhashDir *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
- assert(dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- // get message ready
- bufferlist bl;
- int nden = 0;
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) {
- // not mine!
- // twiddle dir_auth?
- if (in->dir) {
- if (in->dir->authority() != dir->authority())
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- continue;
- }
-
- // -- dentry
- dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl.append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl.append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl.append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl.append("I", 1); // inode dentry
-
- encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export
- nden++;
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // proxy
- in->state_set(CInode::STATE_PROXY);
- in->get(CInode::PIN_PROXY);
- hash_proxy_inos[dir].push_back(in);
-
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- cache->imports.insert(in->dir);
- in->dir->get(CDir::PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDir::PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- cache->exports.erase(in->dir);
- cache->nested_exports[dir].erase(in->dir);
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // we should have no nested exports; we're not auth for the dir!
- assert(cache->nested_exports[dir].empty());
- cache->nested_exports.erase(dir);
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); // later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDir::PIN_HASHED);
- cache->hashdirs.erase(dir);
- dir->mark_clean();
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->_mark_dirty(); // fixme
- mds->mdlog->submit_entry(new EString("unhash inode dirty fixme"));
- }
-
- // init gather set
- mds->get_mds_map()->get_active_mds_set( hash_gather[dir] );
- hash_gather[dir].erase(mds->get_nodeid());
-
- // send unhash message
- mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
- dir->authority(), MDS_PORT_MIGRATOR);
-}
-
-
-/*
- * first notify comes from auth.
- * send notifies to all other peers, with peer = self
- * if we get notify from peer=other, remove from our gather list.
- * when we've gotten notifies from everyone,
- * unpin proxies,
- * send notify_ack to auth.
- * this ensures that all mds links are flushed of cache_expire type messages.
- */
-void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_finish " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- int from = m->get_source().num();
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
- delete m;
-
- // did we send our shout out?
- if (from == dir->authority()) {
- // send notify to everyone else in weird chatter storm
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == from) continue;
- if (i == mds->get_nodeid()) continue;
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
- }
- }
-
- // are we done?
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- return;
- }
- hash_gather.erase(dir);
-
- // all done!
- dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
-
- // unpin proxies
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CInode::STATE_PROXY));
- in->state_clear(CInode::STATE_PROXY);
- in->put(CInode::PIN_PROXY);
- }
-
- // unfreeze
- dir->unfreeze_dir();
-
- // ack
- dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
- mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
-
-}
// use generic range. FIXME THIS IS CRAP
free.clear();
+#ifdef __LP64__
uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 40;
uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 40) - 1;
+#else
+# warning this looks like a 32-bit system, using small inode numbers.
+ uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25;
+ uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1;
+#endif
free.insert(start, end);
state = STATE_ACTIVE;
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_start((FileLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_rdlock_start((ScatterLock*)lock, mdr);
default:
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_finish((FileLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_rdlock_finish((ScatterLock*)lock, mdr);
default:
+// Dispatch a write-lock request to the handler matching the lock's type:
+// IDIRFRAGTREE and IDIR go to scatter_wrlock_start(), IVERSION goes to
+// local_wrlock_start().  Any other lock type hits the assert.
+// Returns whatever the selected handler returns.
bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_wrlock_start((ScatterLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return local_wrlock_start((LocalLock*)lock, mdr);
default:
- assert(0);
+ assert(0);
+ // only reached if the assert is compiled out; gives this path a return value
+ return false;
}
}
void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_wrlock_finish((ScatterLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return file_xlock_start((FileLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return local_xlock_start((LocalLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
assert(0);
default:
return file_xlock_finish((FileLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return local_xlock_finish((LocalLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
assert(0);
default:
cap->set_suppress(true);
} else {
// make sure it has sufficient caps
- if (cap->wanted() & ~my_want) {
+ if (my_want & ~cap->wanted()) {
// augment wanted caps for this client
cap->set_wanted( cap->wanted() | my_want );
}
<< endl;
// update wanted
- if (cap->wanted() != wanted)
- cap->set_wanted(wanted);
+ if (cap->wanted() != wanted) {
+ if (m->get_seq() < cap->get_last_seq()) {
+ /* this is awkward.
+ client may be trying to release caps (i.e. inode closed, etc.) by reducing its wanted
+ set.
+ but it may also be opening the same filename, not sure that it'll map to the same inode.
+ so, we don't want wanted reductions to clobber mds's notion of wanted unless we're
+ sure the client has seen all the latest caps.
+ */
+ dout(-10) << "handle_client_file_caps ignoring wanted " << cap_string(m->get_wanted())
+ << " bc seq " << m->get_seq() << " < " << cap->get_last_seq() << endl;
+ } else {
+ cap->set_wanted(wanted);
+ }
+ }
// confirm caps
int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
case LOCK_OTYPE_DN:
case LOCK_OTYPE_IAUTH:
case LOCK_OTYPE_ILINK:
- case LOCK_OTYPE_IDIRFRAGTREE:
handle_simple_lock(lock, m);
break;
handle_file_lock((FileLock*)lock, m);
break;
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
handle_scatter_lock((ScatterLock*)lock, m);
break;
!lock->is_rdlocked() &&
!lock->is_xlocked() &&
lock->get_state() == LOCK_SYNC)
- scatter_lock(lock);
+ lock->set_state(LOCK_SCATTER);
+ //scatter_scatter(lock);
// can wrlock?
if (lock->can_wrlock()) {
}
// wait for write.
- lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE,
+ new C_MDS_RetryRequest(mdcache, mdr));
// initiate scatter or lock?
if (lock->is_stable()) {
inode_t *pi = in->project_inode();
pi->version = in->pre_dirty();
- EUpdate *le = new EUpdate("dir.mtime writebehind");
+ EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind");
le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
private:
int _type;
off_t _start_off,_end_off;
+
friend class MDLog;
public:
- LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { }
+ LogEvent(int t) :
+ _type(t), _start_off(0), _end_off(0) { }
virtual ~LogEvent() { }
int get_type() { return _type; }
out << "event(" << _type << ")";
}
-
/*** live journal ***/
/* obsolete() - is this entry committed to primary store, such that
utime_t elapsed = now;
elapsed -= first;
+ // sample?
+ if ((double)now - (double)last_sample > g_conf.mds_bal_sample_interval) {
+ dout(15) << "tick last_sample now " << now << endl;
+ last_sample = now;
+ }
+
// balance?
if (true &&
mds->get_nodeid() == 0 &&
// hash?
if (true &&
- g_conf.num_mds > 1 &&
- now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) {
- last_hash = now;
- do_hashing();
+ now.sec() - last_fragment.sec() > g_conf.mds_bal_fragment_interval) {
+ last_fragment = now;
+ do_fragmenting();
}
}
mds_load_t MDBalancer::get_load()
{
mds_load_t load;
- if (mds->mdcache->get_root())
- load.root =
- mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM];
- // +
- // mds->mdcache->get_root()->popularity[MDS_POP_NESTED];
+
+ if (mds->mdcache->get_root()) {
+ list<CDir*> ls;
+ mds->mdcache->get_root()->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin();
+ p != ls.end();
+ p++) {
+ load.auth += (*p)->pop_auth_subtree_nested;
+ load.all += (*p)->pop_nested;
+ }
+ }
load.req_rate = mds->get_req_rate();
load.queue_len = mds->messenger->get_dispatch_queue_len();
void MDBalancer::send_heartbeat()
{
+ utime_t now = g_clock.now();
if (!mds->mdcache->get_root()) {
dout(5) << "no root on send_heartbeat" << endl;
mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds));
int from = im->inode->authority().first;
if (from == mds->get_nodeid()) continue;
if (im->get_inode()->is_stray()) continue;
- import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load();
+ import_map[from] += im->pop_auth_subtree.meta_load(now);
}
mds_import_map[ mds->get_nodeid() ] = import_map;
dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl;
if (!mds->mdcache->get_root()) {
- dout(10) << "no root on handle" << endl;
+ dout(10) << "opening root on handle_heartbeat" << endl;
mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m));
return;
}
+// Walk every subtree root known to the cache and ask the migrator to
+// export_empty_import() those that are empty (get_size() == 0) and whose
+// inode is not the root.  Subtrees we are not auth for, whose auth is
+// ambiguous, or that are freezing/frozen are skipped.
void MDBalancer::export_empties()
{
dout(5) << "export_empties checking for empty imports" << endl;
- dout(0) << "IMPLEMENT ME" << endl;
- /*
- for (set<CDir*>::iterator it = mds->mdcache->subtrees.begin();
+
+ // NOTE(review): export_empty_import() is called while iterating
+ // mdcache->subtrees; confirm it cannot modify that map mid-loop.
+ for (map<CDir*,set<CDir*> >::iterator it = mds->mdcache->subtrees.begin();
it != mds->mdcache->subtrees.end();
it++) {
- CDir *dir = *it;
+ // the map key is the subtree root dirfrag
+ CDir *dir = it->first;
+ if (!dir->is_auth() ||
+ dir->is_ambiguous_auth() ||
+ dir->is_freezing() ||
+ dir->is_frozen())
+ continue;
if (!dir->inode->is_root() && dir->get_size() == 0)
mds->mdcache->migrator->export_empty_import(dir);
}
- */
}
-void MDBalancer::do_hashing()
+// Process split_queue: for every queued dirfrag that is still in cache and
+// for which we are auth, call mdcache->split_dir(); then empty the queue.
+// Returns immediately when the queue is empty.
+void MDBalancer::do_fragmenting()
{
- if (hash_queue.empty()) {
- dout(20) << "do_hashing has nothing to do" << endl;
+ if (split_queue.empty()) {
+ dout(20) << "do_fragmenting has nothing to do" << endl;
return;
}
- dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl;
+ dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << endl;
- for (set<inodeno_t>::iterator i = hash_queue.begin();
- i != hash_queue.end();
+ for (set<dirfrag_t>::iterator i = split_queue.begin();
+ i != split_queue.end();
i++) {
- inodeno_t dirino = *i;
- CInode *in = mds->mdcache->get_inode(dirino);
- if (!in) continue;
- /*
- CDir *dir = in->dir;
+ // entries whose dirfrag is no longer cached are silently dropped
+ CDir *dir = mds->mdcache->get_dirfrag(*i);
if (!dir) continue;
if (!dir->is_auth()) continue;
- dout(0) << "do_hashing hashing " << *dir << endl;
- mds->mdcache->migrator->hash_dir(dir);
- */
+ dout(0) << "do_fragmenting splitting " << *dir << endl;
+ // NOTE(review): the literal 3 is presumably the number of bits to split by -- confirm against split_dir()
+ mds->mdcache->split_dir(dir, 3);
}
- hash_queue.clear();
+ split_queue.clear();
}
{
int cluster_size = mds->get_mds_map()->get_num_mds();
int whoami = mds->get_nodeid();
+ utime_t now = g_clock.now();
+
+ dump_pop_map();
// reset
my_targets.clear();
// rescale! turn my mds_load back into meta_load units
double load_fac = 1.0;
- if (mds_load[whoami].mds_load() > 0) {
- load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load();
+ if (mds_load[whoami].mds_load(now) > 0) {
+ load_fac = mds_load[whoami].auth.meta_load(now) / mds_load[whoami].mds_load(now);
dout(7) << " load_fac is " << load_fac
- << " <- " << mds_load[whoami].root.meta_load()
- << " / " << mds_load[whoami].mds_load()
- << endl;
+ << " <- " << mds_load[whoami].auth.meta_load(now)
+ << " / " << mds_load[whoami].mds_load(now)
+ << endl;
}
double total_load = 0;
multimap<double,int> load_map;
for (int i=0; i<cluster_size; i++) {
- double l = mds_load[i].mds_load() * load_fac;
+ double l = mds_load[i].mds_load(now) * load_fac;
mds_meta_load[i] = l;
if (whoami == 0)
dout(-5) << " mds" << i
- << " meta load " << mds_load[i]
- << " = " << mds_load[i].mds_load()
- << " --> " << l << endl;
+ << " " << mds_load[i]
+ << " = " << mds_load[i].mds_load(now)
+ << " ~ " << l << endl;
if (whoami == i) my_load = l;
total_load += l;
<< endl;
// under or over?
- if (my_load < target_load) {
- dout(5) << " i am underloaded, doing nothing." << endl;
+ if (my_load < target_load * (1.0 + g_conf.mds_bal_min_rebalance)) {
+ dout(5) << " i am underloaded or barely overloaded, doing nothing." << endl;
show_imports();
return;
}
- dout(5) << " i am overloaded" << endl;
+ dout(5) << " i am sufficiently overloaded" << endl;
// first separate exporters and importers
if (true) {
// analyze import_map; do any matches i can
- dout(5) << " matching exporters to import sources" << endl;
+ dout(15) << " matching exporters to import sources" << endl;
// big -> small exporters
for (multimap<double,int>::reverse_iterator ex = exporters.rbegin();
if (1) {
if (beat % 2 == 1) {
// old way
- dout(5) << " matching big exporters to big importers" << endl;
+ dout(15) << " matching big exporters to big importers" << endl;
// big exporters to big importers
multimap<double,int>::reverse_iterator ex = exporters.rbegin();
multimap<double,int>::iterator im = importers.begin();
}
} else {
// new way
- dout(5) << " matching small exporters to big importers" << endl;
+ dout(15) << " matching small exporters to big importers" << endl;
// small exporters to big importers
multimap<double,int>::iterator ex = exporters.begin();
multimap<double,int>::iterator im = importers.begin();
CDir *im = *it;
if (im->get_inode()->is_stray()) continue;
- double pop = im->popularity[MDS_POP_CURDOM].meta_load();
+ double pop = im->pop_auth_subtree.meta_load(now);
if (pop < g_conf.mds_bal_idle_threshold &&
im->inode != mds->mdcache->get_root() &&
im->inode->authority().first != mds->get_nodeid()) {
mds->mdcache->migrator->export_dir(im, im->inode->authority().first);
continue;
}
+
import_pop_map[ pop ] = im;
int from = im->inode->authority().first;
dout(15) << " map: i imported " << *im << " from " << from << endl;
if (amount < MIN_OFFLOAD) continue;
- dout(-5) << " sending " << amount << " to mds" << target
+ dout(5) << "want to send " << amount << " to mds" << target
//<< " .. " << (*it).second << " * " << load_fac
<< " -> " << amount
<< endl;//" .. fudge is " << fudge << endl;
if (dir->inode->is_root()) continue;
if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress
- double pop = dir->popularity[MDS_POP_CURDOM].meta_load();
+ double pop = dir->pop_auth_subtree.meta_load(now);
assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
if (pop <= amount-have) {
pot != candidates.end();
pot++) {
if ((*pot)->get_inode()->is_stray()) continue;
- find_exports(*pot, amount, exports, have, already_exporting);
- if (have > amount-MIN_OFFLOAD) {
+ find_exports(*pot, amount, exports, have, already_exporting, now);
+ if (have > amount-MIN_OFFLOAD)
break;
- }
}
//fudge = amount - have;
total_sent += have;
for (list<CDir*>::iterator it = exports.begin(); it != exports.end(); it++) {
- dout(-5) << " exporting to mds" << target
- << " fragment " << **it
- << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load()
+ dout(-5) << " - exporting "
+ << (*it)->pop_auth_subtree.meta_load(now)
+ << " to mds" << target
+ << " " << **it
<< endl;
mds->mdcache->migrator->export_dir(*it, target);
-
- // hack! only do one dir.
- break;
}
}
double amount,
list<CDir*>& exports,
double& have,
- set<CDir*>& already_exporting)
+ set<CDir*>& already_exporting,
+ utime_t now)
{
double need = amount - have;
if (need < amount * g_conf.mds_bal_min_start)
double midchunk = need * g_conf.mds_bal_midchunk;
double minchunk = need * g_conf.mds_bal_minchunk;
- list<CDir*> bigger;
+ list<CDir*> bigger_rep, bigger_unrep;
multimap<double, CDir*> smaller;
- double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load();
- double dir_sum = 0;
- dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl;
+ double dir_pop = dir->pop_auth_subtree.meta_load(now);
+ dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl;
+ double subdir_sum = 0;
for (CDir_map_t::iterator it = dir->begin();
it != dir->end();
it++) {
for (list<CDir*>::iterator p = dfls.begin();
p != dfls.end();
++p) {
- CDir *dir = *p;
- if (!dir->is_auth()) continue;
- if (already_exporting.count(dir)) continue;
+ CDir *subdir = *p;
+ if (!subdir->is_auth()) continue;
+ if (already_exporting.count(subdir)) continue;
- if (dir->is_frozen()) continue; // can't export this right now!
- //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now!
+ if (subdir->is_frozen()) continue; // can't export this right now!
// how popular?
- double pop = dir->popularity[MDS_POP_CURDOM].meta_load();
- dir_sum += pop;
- dout(20) << " pop " << pop << " " << *dir << endl;
+ double pop = subdir->pop_auth_subtree.meta_load(now);
+ subdir_sum += pop;
+ dout(15) << " subdir pop " << pop << " " << *subdir << endl;
if (pop < minchunk) continue;
// lucky find?
if (pop > needmin && pop < needmax) {
- exports.push_back(dir);
+ exports.push_back(subdir);
+ already_exporting.insert(subdir);
have += pop;
return;
}
- if (pop > need)
- bigger.push_back(dir);
- else
- smaller.insert(pair<double,CDir*>(pop, dir));
+ if (pop > need) {
+ if (subdir->is_rep())
+ bigger_rep.push_back(subdir);
+ else
+ bigger_unrep.push_back(subdir);
+ } else
+ smaller.insert(pair<double,CDir*>(pop, subdir));
}
}
- dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl;
+ dout(15) << " sum " << subdir_sum << " / " << dir_pop << endl;
// grab some sufficiently big small items
multimap<double,CDir*>::reverse_iterator it;
if ((*it).first < midchunk)
break; // try later
- dout(7) << " taking smaller " << *(*it).second << endl;
+ dout(7) << " taking smaller " << *(*it).second << endl;
exports.push_back((*it).second);
already_exporting.insert((*it).second);
}
// apparently not enough; drill deeper into the hierarchy (if non-replicated)
- for (list<CDir*>::iterator it = bigger.begin();
- it != bigger.end();
+ for (list<CDir*>::iterator it = bigger_unrep.begin();
+ it != bigger_unrep.end();
it++) {
- if ((*it)->is_rep()) continue;
- dout(7) << " descending into " << **it << endl;
- find_exports(*it, amount, exports, have, already_exporting);
+ dout(15) << " descending into " << **it << endl;
+ find_exports(*it, amount, exports, have, already_exporting, now);
if (have > needmin)
return;
}
for (;
it != smaller.rend();
it++) {
-
- dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl;
+ dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl;
exports.push_back((*it).second);
already_exporting.insert((*it).second);
return;
}
- // ok fine, drill inot replicated dirs
- for (list<CDir*>::iterator it = bigger.begin();
- it != bigger.end();
+ // ok fine, drill into replicated dirs
+ for (list<CDir*>::iterator it = bigger_rep.begin();
+ it != bigger_rep.end();
it++) {
- if (!(*it)->is_rep()) continue;
- dout(7) << " descending into replicated " << **it << endl;
- find_exports(*it, amount, exports, have, already_exporting);
+ dout(7) << " descending into replicated " << **it << endl;
+ find_exports(*it, amount, exports, have, already_exporting, now);
if (have > needmin)
return;
}
-void MDBalancer::hit_inode(CInode *in, int type)
+// Record one hit of the given type, at time now, on the inode's own
+// popularity counter, then pass the hit on to the inode's parent dirfrag
+// (if it has one) via hit_dir().
+void MDBalancer::hit_inode(utime_t now, CInode *in, int type)
{
+ // hit inode
+ in->pop.get(type).hit(now);
+
+ // an inode without a parent dirfrag has nothing further to update
+ if (in->get_parent_dir())
+ hit_dir(now, in->get_parent_dir(), type);
+}
+/*
// hit me
- float me = in->popularity[MDS_POP_JUSTME].pop[type].hit();
- float nested = in->popularity[MDS_POP_NESTED].pop[type].hit();
- float curdom = 0;
- float anydom = 0;
+ in->popularity[MDS_POP_JUSTME].pop[type].hit(now);
+ in->popularity[MDS_POP_NESTED].pop[type].hit(now);
if (in->is_auth()) {
- curdom = in->popularity[MDS_POP_CURDOM].pop[type].hit();
- anydom = in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+ in->popularity[MDS_POP_CURDOM].pop[type].hit(now);
+ in->popularity[MDS_POP_ANYDOM].pop[type].hit(now);
+
+ dout(20) << "hit_inode " << type << " pop "
+ << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, "
+ << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, "
+ << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " curdom, "
+ << in->popularity[MDS_POP_ANYDOM].pop[type].get(now) << " anydom"
+ << " on " << *in
+ << endl;
+ } else {
+ dout(20) << "hit_inode " << type << " pop "
+ << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, "
+ << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, "
+ << " on " << *in
+ << endl;
}
- dout(20) << "hit_inode " << type << " pop " << me << " me, "
- << nested << " nested, "
- << curdom << " curdom, "
- << anydom << " anydom"
- << " on " << *in
- << endl;
-
// hit auth up to import
CDir *dir = in->get_parent_dir();
- if (dir) hit_dir(dir, type);
-}
+ if (dir) hit_dir(now, dir, type);
+*/
-void MDBalancer::hit_dir(CDir *dir, int type)
+// Record `amount` hits of `type`, at time `now`, against a dirfrag.
+//
+//  1. bumps the dirfrag's own counter (pop_me);
+//  2. for META_POP_IRD hits, and only when the dirfrag has not yet been
+//     looked at since last_sample, compares the dirfrag's pop_auth_subtree
+//     value against the replicate / unreplicate thresholds and flips
+//     dir_rep accordingly (auth dirfrags only);
+//  3. walks from the dirfrag up through its ancestors, hitting pop_nested
+//     on each, pop_auth_subtree until the first subtree root has been
+//     passed, and pop_auth_subtree_nested on every level when the starting
+//     dirfrag is auth.  A non-zero rd_adj from step 2 is applied to the
+//     META_POP_IRD counters on the same walk.
+void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, double amount)
{
// hit me
- float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit();
+ dir->pop_me.get(type).hit(now, amount);
+
+ /*
+ dir->popularity[MDS_POP_JUSTME].pop[type].hit(now, amount);
// hit modify counter, if this was a modify
if (g_conf.num_mds > 2 && // FIXME >2 thing
!dir->inode->is_root() && // not root (for now at least)
dir->is_auth()) {
+ float v = dir->popularity[MDS_POP_JUSTME].pop[type].get();
+
dout(20) << "hit_dir " << type << " pop " << v << " me "
<< *dir << endl;
- // hash this dir? (later?)
- if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) ||
- //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) ||
- (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) &&
- hash_queue.count(dir->ino()) == 0) {
- dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl;
- hash_queue.insert(dir->ino());
+ // fragment this dir? (later?)
+ if (((g_conf.mds_bal_split_size > 0 &&
+ dir->get_size() > (unsigned)g_conf.mds_bal_split_size) ||
+ (v > g_conf.mds_bal_split_rd && type == META_POP_IRD) ||
+ //(v > g_conf.mds_bal_split_wr && type == META_POP_IWR) ||
+ (v > g_conf.mds_bal_split_wr && type == META_POP_DWR)) &&
+ split_queue.count(dir->dirfrag()) == 0) {
+ dout(0) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << endl;
+ split_queue.insert(dir->dirfrag());
}
}
+ */
- hit_recursive(dir, type);
-}
-
-
-
-void MDBalancer::hit_recursive(CDir *dir, int type)
-{
- bool anydom = dir->is_auth();
- bool curdom = dir->is_auth();
-
- float rd_adj = 0.0;
-
// replicate?
- float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm??
+ double rd_adj = 0;
+ // only reads drive replication, and each dirfrag is evaluated at most once per sample
+ if (type == META_POP_IRD &&
+ dir->last_popularity_sample < last_sample) {
+ float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm??
+ dir->last_popularity_sample = last_sample;
- dout(20) << "hit_recursive " << type << " pop " << dir_pop << " curdom " << *dir << endl;
-
- if (dir->is_auth()) {
- if (!dir->is_rep() &&
- dir_pop >= g_conf.mds_bal_replicate_threshold) {
- // replicate
- float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get();
- rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp;
- rd_adj /= 2.0; // temper somewhat
-
- dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
-
- dir->dir_rep = CDir::REP_ALL;
- mds->mdcache->send_dir_updates(dir, true);
-
- dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj);
- dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj);
- }
-
- if (!dir->ino() != 1 &&
- dir->is_rep() &&
- dir_pop < g_conf.mds_bal_unreplicate_threshold) {
- // unreplicate
- dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+ dout(20) << "hit_dir " << type << " pop " << dir_pop << " in " << *dir << endl;
+
+ if (dir->is_auth()) {
+ if (!dir->is_rep() &&
+ dir_pop >= g_conf.mds_bal_replicate_threshold) {
+ // replicate
+ float rdp = dir->pop_me.get(META_POP_IRD).get(now);
+ rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp;
+ rd_adj /= 2.0; // temper somewhat
+
+ dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
+
+ dir->dir_rep = CDir::REP_ALL;
+ mds->mdcache->send_dir_updates(dir, true);
+
+ dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+ }
- dir->dir_rep = CDir::REP_NONE;
- mds->mdcache->send_dir_updates(dir);
+ // never unreplicate ino 1.  this used to read "!dir->ino() != 1",
+ // which compares the negated ino against 1 and is true for every
+ // non-zero ino, so the guard excluded nothing.
+ if (dir->ino() != 1 &&
+ dir->is_rep() &&
+ dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+ // unreplicate
+ dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+
+ dir->dir_rep = CDir::REP_NONE;
+ mds->mdcache->send_dir_updates(dir);
+ }
}
}
+ // adjust ancestors
+ bool hit_subtree = dir->is_auth(); // current auth subtree (if any)
+ bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
while (dir) {
- CInode *in = dir->inode;
-
- dir->popularity[MDS_POP_NESTED].pop[type].hit();
- in->popularity[MDS_POP_NESTED].pop[type].hit();
+ dir->pop_nested.get(type).hit(now, amount);
+ if (rd_adj != 0.0)
+ dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj);
- if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj);
-
- if (anydom) {
- dir->popularity[MDS_POP_ANYDOM].pop[type].hit();
- in->popularity[MDS_POP_ANYDOM].pop[type].hit();
- }
-
- if (curdom) {
- dir->popularity[MDS_POP_CURDOM].pop[type].hit();
- in->popularity[MDS_POP_CURDOM].pop[type].hit();
+ if (hit_subtree) {
+ dir->pop_auth_subtree.get(type).hit(now, amount);
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj);
}
+
+ // hit_subtree_nested is never cleared, so this runs at every level when we started on an auth dirfrag
+ if (hit_subtree_nested) {
+ dir->pop_auth_subtree_nested.get(type).hit(now, amount);
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj);
+ }
if (dir->is_subtree_root())
- curdom = false; // end of auth domain, stop hitting auth counters.
+ hit_subtree = false; // end of auth domain, stop hitting auth counters.
dir = dir->inode->get_parent_dir();
}
}
/*
- * subtract off an exported chunk
+ * subtract off an exported chunk.
+ * this excludes *dir itself (encode_export_dir should have taken care of that)
+ * we _just_ do the parents' nested counters.
+ *
+ * NOTE: call me _after_ forcing *dir into a subtree root,
+ * but _before_ doing the encode_export_dirs.
*/
void MDBalancer::subtract_export(CDir *dir)
{
- meta_load_t curdom = dir->popularity[MDS_POP_CURDOM];
-
- bool in_domain = !dir->is_subtree_root();
-
+ // copy the exported dirfrag's load up front; dir is reassigned in the loop below
+ dirfrag_load_vec_t subload = dir->pop_auth_subtree;
+
while (true) {
- CInode *in = dir->inode;
-
- in->popularity[MDS_POP_ANYDOM] -= curdom;
- if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom;
-
- dir = in->get_parent_dir();
+ // step to the parent first, so *dir itself is never adjusted
+ dir = dir->inode->get_parent_dir();
if (!dir) break;
- if (dir->is_subtree_root()) in_domain = false;
-
- dir->popularity[MDS_POP_ANYDOM] -= curdom;
- if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom;
+ dir->pop_nested -= subload;
+ dir->pop_auth_subtree_nested -= subload;
}
}
+// Add an imported chunk: adds dir's pop_auth_subtree load to the
+// pop_nested and pop_auth_subtree_nested counters of every ancestor
+// dirfrag.  *dir itself is not adjusted.
void MDBalancer::add_import(CDir *dir)
{
- meta_load_t curdom = dir->popularity[MDS_POP_CURDOM];
-
- bool in_domain = !dir->is_subtree_root();
+ // copy the imported dirfrag's load up front; dir is reassigned in the loop below
+ dirfrag_load_vec_t subload = dir->pop_auth_subtree;
while (true) {
- CInode *in = dir->inode;
-
- in->popularity[MDS_POP_ANYDOM] += curdom;
- if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom;
-
- dir = in->get_parent_dir();
+ // step to the parent first, so only ancestors are touched
+ dir = dir->inode->get_parent_dir();
if (!dir) break;
- if (dir->is_subtree_root()) in_domain = false;
-
- dir->popularity[MDS_POP_ANYDOM] += curdom;
- if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom;
- }
-
+ dir->pop_nested += subload;
+ dir->pop_auth_subtree_nested += subload;
+ }
}
}
+// Debug aid: write one line per cached dirfrag reachable from the root
+// inode to a file named "popdump.<beat_epoch>.mds<nodeid>" in the current
+// working directory.  Each line holds four tab-separated integer loads
+// (pop_me, pop_nested, pop_auth_subtree, pop_auth_subtree_nested, all via
+// meta_load(now)), then "." plus the inode's path, plus "___<frag>" for
+// any frag other than the default frag_t().
+// Does nothing beyond logging if the file cannot be opened.
+void MDBalancer::dump_pop_map()
+{
+  // "popdump." + ".mds" is 12 chars and each %d can need up to 11, so the
+  // old 20-byte buffer overflowed once beat_epoch reached 7 digits.
+  // snprintf bounds the write regardless.
+  char fn[64];
+  snprintf(fn, sizeof(fn), "popdump.%d.mds%d", beat_epoch, mds->get_nodeid());
+
+  dout(1) << "dump_pop_map to " << fn << endl;
+
+  ofstream myfile;
+  myfile.open(fn);
+  if (!myfile.is_open()) {
+    // don't walk the whole cache just to write into a dead stream
+    dout(0) << "dump_pop_map unable to open " << fn << endl;
+    return;
+  }
+
+  list<CInode*> iq;
+  if (mds->mdcache->root)
+    iq.push_back(mds->mdcache->root);
+
+  utime_t now = g_clock.now();
+  while (!iq.empty()) {
+    CInode *in = iq.front();
+    iq.pop_front();
+
+    // pop stats
+    /*for (int a=0; a<MDS_NPOP; a++)
+      for (int b=0; b<META_NPOP; b++)
+        myfile << in->popularity[a].pop[b].get(now) << "\t";
+    */
+
+    // recurse, depth-first.
+    if (in->is_dir()) {
+
+      list<CDir*> dirs;
+      in->get_dirfrags(dirs);
+      for (list<CDir*>::iterator p = dirs.begin();
+           p != dirs.end();
+           ++p) {
+        CDir *dir = *p;
+
+        myfile << (int)dir->pop_me.meta_load(now) << "\t";
+        myfile << (int)dir->pop_nested.meta_load(now) << "\t";
+        myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t";
+        myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t";
+
+        // filename last.  (was "string p", which redeclared the loop
+        // iterator p inside the loop body.)
+        string path;
+        in->make_path(path);
+        myfile << "." << path;
+        if (dir->get_frag() != frag_t())
+          myfile << "___" << (unsigned)dir->get_frag();
+        myfile << endl; //"/" << dir->get_frag() << endl;
+
+        // add contents; push_front keeps the walk depth-first
+        for (map<string,CDentry*>::iterator q = dir->items.begin();
+             q != dir->items.end();
+             q++)
+          if (q->second->is_primary())
+            iq.push_front(q->second->get_inode());
+      }
+    }
+
+  }
+
+  myfile.close();
+}
+
+
/* replicate?
int beat_epoch;
utime_t last_heartbeat;
- utime_t last_hash;
+ utime_t last_fragment;
+ utime_t last_sample;
+
// todo
- set<inodeno_t> hash_queue;
+ set<dirfrag_t> split_queue;
// per-epoch scatter/gathered info
hash_map<int, mds_load_t> mds_load;
void tick();
- void do_hashing();
+ void do_fragmenting();
void export_empties();
void do_rebalance(int beat);
double amount,
list<CDir*>& exports,
double& have,
- set<CDir*>& already_exporting);
+ set<CDir*>& already_exporting,
+ utime_t now);
void subtract_export(class CDir *ex);
void add_import(class CDir *im);
- void hit_inode(class CInode *in, int type=0);
- void hit_dir(class CDir *dir, int type=0);
- void hit_recursive(class CDir *dir, int type=0);
+ void hit_inode(utime_t now, class CInode *in, int type);
+ void hit_dir(utime_t now, class CDir *dir, int type, double amount=1.0);
+ void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj);
void show_imports(bool external=false);
+ void dump_pop_map();
};
#include "events/EString.h"
#include "events/EPurgeFinish.h"
#include "events/EImportFinish.h"
+#include "events/EFragment.h"
#include "messages/MGenericMessage.h"
#include "messages/MMDSSlaveRequest.h"
+#include "messages/MMDSFragmentNotify.h"
+
+
#include "IdAllocator.h"
#include "common/Timer.h"
void MDCache::log_stat(Logger *logger)
{
if (get_root()) {
- logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load());
- logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load());
+ utime_t now = g_clock.now();
+ //logger->set("pop", (int)get_root()->pop_nested.meta_load(now));
+ //logger->set("popauth", (int)get_root()->pop_auth_subtree_nested.meta_load(now));
}
logger->set("c", lru.lru_get_size());
logger->set("cpin", lru.lru_get_num_pinned());
CDentry *straydn = straydir->lookup(straydname);
if (!straydn)
- straydn = straydir->add_dentry(straydname, 0);
+ straydn = straydir->add_null_dentry(straydname);
return straydn;
}
// i am now the subtree root.
root = dir;
+ // adjust recursive pop counters
+ if (dir->is_auth()) {
+ CDir *p = dir->get_parent_dir();
+ while (p) {
+ p->pop_auth_subtree -= dir->pop_auth_subtree;
+ if (p->is_subtree_root()) break;
+ p = p->inode->get_parent_dir();
+ }
+ }
+
eval_subtree_root(dir);
}
try_subtree_merge_at(*p);
}
+// Context whose finish() hands the stored inode back to
+// MDCache::subtree_merge_writebehind_finish().  The result code passed to
+// finish() is ignored.
+class C_MDC_SubtreeMergeWB : public Context {
+  MDCache *cache;   // cache to call back into
+  CInode *inode;    // inode the callback is for
+public:
+  C_MDC_SubtreeMergeWB(MDCache *c, CInode *target)
+    : cache(c),
+      inode(target)
+  { }
+  void finish(int r) {
+    cache->subtree_merge_writebehind_finish(inode);
+  }
+};
+
void MDCache::try_subtree_merge_at(CDir *dir)
{
dout(10) << "try_subtree_merge_at " << *dir << endl;
subtrees.erase(dir);
subtrees[parent].erase(dir);
+ // adjust popularity?
+ if (dir->is_auth()) {
+ CDir *p = dir->get_parent_dir();
+ while (p) {
+ p->pop_auth_subtree += dir->pop_auth_subtree;
+ if (p->is_subtree_root()) break;
+ p = p->inode->get_parent_dir();
+ }
+ }
+
eval_subtree_root(dir);
+
+ // journal inode?
+ // (this is a large hammer to ensure that dirfragtree updates will
+ // hit the disk before the relevant dirfrags ever close)
+ if (dir->inode->is_auth() &&
+ dir->inode->can_auth_pin()) {
+ CInode *in = dir->inode;
+ dout(10) << "try_subtree_merge_at journaling merged bound " << *in << endl;
+
+ in->auth_pin();
+
+ // journal write-behind.
+ inode_t *pi = in->project_inode();
+ pi->version = in->pre_dirty();
+
+ EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind");
+ le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
+ le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
+
+ mds->mdlog->submit_entry(le);
+ mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in));
+ }
}
show_subtrees(15);
}
+// Callback target of C_MDC_SubtreeMergeWB: calls
+// pop_and_dirty_projected_inode() on the inode and then auth_unpin()s it.
+// NOTE(review): the unpin presumably balances the auth_pin() taken in
+// try_subtree_merge_at() before the write-behind entry was submitted.
+void MDCache::subtree_merge_writebehind_finish(CInode *in)
+{
+  // log the inode itself (*in), as the other debug lines in this file do;
+  // streaming the bare pointer only printed an address.
+  dout(10) << "subtree_merge_writebehind_finish on " << *in << endl;
+  in->pop_and_dirty_projected_inode();
+  in->auth_unpin();
+}
+
void MDCache::eval_subtree_root(CDir *dir)
{
// evaluate subtree inode dirlock?
mds->locker->scatter_eval(&dir->inode->dirlock);
else
mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned **
- }
+ }
+
}
show_subtrees();
}
+
void MDCache::adjust_bounded_subtree_auth(CDir *dir, list<dirfrag_t>& bound_dfs, pair<int,int> auth)
{
dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
adjust_bounded_subtree_auth(dir, bounds, auth);
}
+// Map a list of dirfrag ids onto the CDir objects currently in cache.
+// The requested frags are grouped per inode; for each inode that is in
+// cache, its dirfragtree is asked for the leaves under every requested
+// frag, and each leaf that has a cached CDir is inserted into result.
+// Inodes not in cache, and leaves with no cached CDir, are skipped.
+void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
+{
+  // bucket the requested frags by directory inode number
+  map<inodeno_t, fragset_t> by_ino;
+  list<dirfrag_t>::iterator df = dfs.begin();
+  while (df != dfs.end()) {
+    by_ino[df->ino].insert(df->frag);
+    ++df;
+  }
+
+  // resolve each bucket against the cache
+  for (map<inodeno_t, fragset_t>::iterator ent = by_ino.begin();
+       ent != by_ino.end();
+       ++ent) {
+    CInode *diri = get_inode(ent->first);
+    if (diri) {
+      fragset_t& wanted = ent->second;
+
+      // collect the leaves under every requested frag
+      list<frag_t> leaves;
+      for (set<frag_t>::iterator fg = wanted.begin(); fg != wanted.end(); ++fg)
+        diri->dirfragtree.get_leaves_under(*fg, leaves);
+
+      dout(15) << "map_dirfrag_set " << wanted << " -> " << leaves
+               << " on " << *diri << endl;
+
+      // keep only the leaves that have a dirfrag object in cache
+      for (list<frag_t>::iterator lf = leaves.begin(); lf != leaves.end(); ++lf) {
+        CDir *dir = diri->get_dirfrag(*lf);
+        if (dir)
+          result.insert(dir);
+      }
+    }
+  }
+}
+
CDir *MDCache::get_subtree_root(CDir *dir)
<< num_subtrees_fullauth() << " fullauth"
<< endl;
- ESubtreeMap *le = new ESubtreeMap;
+ ESubtreeMap *le = new ESubtreeMap();
// include all auth subtrees, and their bounds.
// and a spanning tree to tie it to the root.
for (map<dirfrag_t, list<dirfrag_t> >::iterator pi = m->subtrees.begin();
pi != m->subtrees.end();
++pi) {
- CDir *im = get_dirfrag(pi->first);
- if (im) {
- adjust_bounded_subtree_auth(im, pi->second, from);
- try_subtree_merge(im);
+ CInode *diri = get_inode(pi->first.ino);
+ if (!diri) continue;
+ bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag);
+ if (forced) {
+ dout(10) << " forced frag " << pi->first.frag << " to leaf in "
+ << diri->dirfragtree
+ << " on " << pi->first << endl;
}
+
+ CDir *dir = diri->get_dirfrag(pi->first.frag);
+ if (!dir) continue;
+
+ adjust_bounded_subtree_auth(dir, pi->second, from);
+ try_subtree_merge(dir);
}
// am i a surviving ambiguous importer?
uncommitted_slave_updates[from][*p].replay(mds);
uncommitted_slave_updates[from].erase(*p);
// log commit
- mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_COMMIT));
+ mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT));
} else {
MDRequest *mdr = request_get(*p);
assert(mdr->slave_request == 0); // shouldn't be doing anything!
if (mds->is_resolve()) {
assert(uncommitted_slave_updates[from].count(*p));
uncommitted_slave_updates[from].erase(*p);
- mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_ROLLBACK));
+ mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK));
} else {
MDRequest *mdr = request_get(*p);
if (mdr->slave_commit) {
rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino());
dn->get_inode()->get_nested_dirfrags(nested);
} else if (dn->is_remote())
- rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, dn->get_remote_ino());
+ rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first,
+ dn->get_remote_ino(), dn->get_remote_d_type());
else
assert(0); // i shouldn't have a non-auth null dentry after replay + trim_non_auth()
}
p != dir->items.end();
++p) {
CDentry *dn = p->second;
- rejoin->add_strong_dentry(dir->dirfrag(), p->first,
+ rejoin->add_strong_dentry(dir->dirfrag(), p->first,
dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0),
dn->is_remote() ? dn->get_remote_ino():inodeno_t(0),
+ dn->is_remote() ? dn->get_remote_d_type():0,
dn->get_replica_nonce(),
dn->lock.get_state());
if (dn->is_primary()) {
int nonce = dn->add_replica(from);
dout(10) << " have " << *dn << endl;
if (ack)
- ack->add_strong_dentry(p->first, q->first,
+ ack->add_strong_dentry(p->first, q->first,
dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0),
dn->is_remote() ? dn->get_remote_ino():inodeno_t(0),
+ dn->is_remote() ? dn->get_remote_d_type():0,
nonce, dn->lock.get_replica_state());
// inode?
CDentry *dn = dir->lookup(q->first);
if (!dn) {
if (q->second.is_remote()) {
- dn = dir->add_dentry(q->first, q->second.remote_ino);
+ dn = dir->add_remote_dentry(q->first, q->second.remote_ino, q->second.remote_d_type);
} else if (q->second.is_null()) {
- dn = dir->add_dentry(q->first);
+ dn = dir->add_null_dentry(q->first);
} else {
CInode *in = get_inode(q->second.ino);
if (!in) in = rejoin_invent_inode(q->second.ino);
- dn = dir->add_dentry(q->first, in);
+ dn = dir->add_primary_dentry(q->first, in);
dout(10) << " missing " << q->second.ino << endl;
if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING);
ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name,
dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0),
dn->is_remote() ? dn->get_remote_ino():inodeno_t(0),
+ dn->is_remote() ? dn->get_remote_d_type():0,
r->second,
dn->lock.get_replica_state());
++p) {
// check container?
if (p->first.ino > 0) {
- CDir *con = get_dirfrag(p->first);
- assert(con); // we had better have this.
+ CInode *coni = get_inode(p->first.ino);
+ assert(coni); // we had better have this.
+ CDir *con = coni->get_approx_dirfrag(p->first.frag);
+ assert(con);
if (!con->is_auth() ||
(con->is_auth() && con->is_exporting() &&
for (map<dirfrag_t, map<string,int> >::iterator pd = p->second.dentries.begin();
pd != p->second.dentries.end();
++pd) {
- dout(0) << " dn expires in dir " << pd->first << endl;
- CDir *dir = get_dirfrag(pd->first);
+ dout(10) << " dn expires in dir " << pd->first << endl;
+ CInode *diri = get_inode(pd->first.ino);
+ assert(diri);
+ CDir *dir = diri->get_dirfrag(pd->first.frag);
if (!dir) {
- dout(0) << " dn expires on " << pd->first << " from " << from << ", don't have it" << endl;
- assert(dir);
- }
- assert(dir->is_auth());
+ dout(0) << " dn expires on " << pd->first << " from " << from << ", must have refragmented" << endl;
+ } else {
+ assert(dir->is_auth());
+ }
for (map<string,int>::iterator p = pd->second.begin();
p != pd->second.end();
++p) {
int nonce = p->second;
+ CDentry *dn;
- CDentry *dn = dir->lookup(p->first);
+ if (dir) {
+ dn = dir->lookup(p->first);
+ } else {
+ // which dirfrag for this dentry?
+ CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first));
+ assert(dir->is_auth());
+ dn = dir->lookup(p->first);
+ }
+
if (!dn)
dout(0) << " missing dentry for " << p->first << " in " << *dir << endl;
assert(dn);
}
// done!
- dout(1) << "shutdown done." << endl;
+ dout(2) << "shutdown done." << endl;
return true;
}
break;
+ case MSG_MDS_FRAGMENTNOTIFY:
+ handle_fragment_notify((MMDSFragmentNotify*)m);
+ break;
// discover?
assert(!cur->is_auth());
if (cur->is_ambiguous_auth()) {
- dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl;
+ dout(10) << "traverse: need dirfrag " << fg << ", waiting for single auth on " << *cur << endl;
cur->add_waiter(CInode::WAIT_SINGLEAUTH, _get_waiter(mdr, req));
return 1;
} else if (dir_discovers.count(cur->ino())) {
- dout(10) << "traverse: need dir, already doing discover for " << *cur << endl;
+ dout(10) << "traverse: need dirfrag " << fg << ", already doing discover for " << *cur << endl;
assert(cur->is_waiter_for(CInode::WAIT_DIR));
} else {
filepath want = path.postfixpath(depth);
- dout(10) << "traverse: need dir, doing discover, want " << want.get_path()
+ dout(10) << "traverse: need dirfrag " << fg << ", doing discover, want " << want.get_path()
<< " from " << *cur << endl;
mds->send_message_mds(new MDiscover(mds->get_nodeid(),
cur->ino(),
// predirty, prepare log entry
version_t pdv = in->pre_dirty();
- EUpdate *le = new EUpdate("anchor_create");
+ EUpdate *le = new EUpdate(mds->mdlog, "anchor_create");
le->metablob.add_dir_context(in->get_parent_dir());
// update the logged inode copy
// predirty, prepare log entry
version_t pdv = in->pre_dirty();
- EUpdate *le = new EUpdate("anchor_destroy");
+ EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy");
le->metablob.add_dir_context(in->get_parent_dir());
// update the logged inode copy
// log removal
version_t pdv = dn->pre_dirty();
- EUpdate *le = new EUpdate("purge_stray");
+ EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
le->metablob.add_dir_context(dn->dir);
le->metablob.add_null_dentry(dn, true);
le->metablob.add_inode_truncate(dn->inode->inode, 0);
}
CDir *curdir = cur->get_dirfrag(fg);
- // am i dir auth (or if no dir, at least the inode auth)
if ((!curdir && !cur->is_auth()) ||
(curdir && !curdir->is_auth())) {
if (curdir) {
- dout(7) << *curdir << " not dirfrag auth, setting dir_auth_hint" << endl;
+ dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << endl;
reply->set_dir_auth_hint(curdir->authority().first);
} else {
- dout(7) << *cur << " dirfrag not open, not inode auth, setting dir_auth_hint" << endl;
+ dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " << *cur << endl;
reply->set_dir_auth_hint(cur->authority().first);
}
reply->set_wanted_xlocks_hint(dis->wants_xlocked());
// send null dentry
dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
<< *curdir << endl;
- dn = curdir->add_dentry(dis->get_dentry(i), 0);
+ dn = curdir->add_null_dentry(dis->get_dentry(i));
}
assert(dn);
dis.update_dir(dir);
dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << endl;
} else {
+ // force frag to leaf in the diri tree
+ if (!diri->dirfragtree.is_leaf(fg)) {
+ dout(7) << "add_replica_dir forcing frag " << fg << " to leaf in the fragtree "
+ << diri->dirfragtree << endl;
+ diri->dirfragtree.force_to_leaf(fg);
+ }
+
// add replica.
dir = diri->add_dirfrag( new CDir(diri, fg, this, false) );
dis.update_dir(dir);
dis.update_dentry(dn);
dout(7) << "add_replica_dentry had " << *dn << endl;
} else {
- dn = dir->add_dentry( dis.get_dname(), 0 );
+ dn = dir->add_null_dentry(dis.get_dname());
dis.update_dentry(dn);
dis.init_dentry_lock(dn);
dout(7) << "add_replica_dentry added " << *dn << endl;
// remote_ino linkage?
if (dis.get_remote_ino()) {
if (dn->is_null())
- dir->link_inode(dn, dis.get_remote_ino());
+ dir->link_remote_inode(dn, dis.get_remote_ino(), dis.get_remote_d_type());
// hrm. yeah.
assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino());
add_inode(in);
dout(10) << "add_replica_inode had " << *in << endl;
if (dn && dn->is_null())
- dn->dir->link_inode(dn, in);
+ dn->dir->link_primary_inode(dn, in);
} else {
dis.update_inode(in);
dout(10) << "add_replica_inode added " << *in << endl;
CInode *in = dn->inode;
dn->dir->unlink_inode(dn);
assert(straydn);
- straydn->dir->link_inode(straydn, in);
+ straydn->dir->link_primary_inode(straydn, in);
} else {
assert(dn->is_remote());
dn->dir->unlink_inode(dn);
+// ===================================================================
+// FRAGMENT
+
+
+/**
+ * adjust_dir_fragments -- adjust fragmentation for a directory
+ *
+ * @diri - directory inode
+ * @basefrag - base fragment
+ * @bits - bit adjustment.  positive for split; anything else takes the
+ *         (unfinished) merge path.
+ * @resultfrags - output: filled by CDir::split(), or the merged base dir
+ * @waiters - output: passed through to CDir::split()/merge(); the callers
+ *            in this file queue them afterwards
+ *
+ * On a split of a subtree root the subtree map is rewritten so that the
+ * result frags replace the base.  The merge path still ends in assert(0).
+ */
+void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+                                   list<CDir*>& resultfrags,
+                                   list<Context*>& waiters)
+{
+  dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
+           << " on " << *diri << endl;
+
+  // yuck.  we may have discovered the inode while it was being fragmented.
+  if (!diri->dirfragtree.is_leaf(basefrag)) {
+    diri->dirfragtree.force_to_leaf(basefrag);
+  }
+
+  CDir *base = diri->get_or_open_dirfrag(this, basefrag);
+
+  // adjust fragtree
+  diri->dirfragtree.split(basefrag, bits);
+  dout(10) << " new fragtree is " << diri->dirfragtree << endl;
+
+  if (bits <= 0) {
+    // merge (not finished)
+    assert(base);
+    base->merge(bits, waiters);
+    resultfrags.push_back(base);
+    assert(0);  // FIXME adjust subtree map!  and clean up this code, probably.
+    return;
+  }
+
+  // split
+  if (!base)
+    return;
+
+  CDir *baseparent = base->get_parent_dir();  // read before the split
+  base->split(bits, resultfrags, waiters);
+
+  // did i change the subtree map?
+  if (!base->is_subtree_root())
+    return;
+
+  // am i a bound?
+  if (baseparent) {
+    CDir *parentroot = get_subtree_root(baseparent);
+    assert(subtrees[parentroot].count(base));
+    subtrees[parentroot].erase(base);
+    list<CDir*>::iterator rf = resultfrags.begin();
+    while (rf != resultfrags.end()) {
+      subtrees[parentroot].insert(*rf);
+      subtrees[*rf].clear();   // new frag is now its own subtree
+      ++rf;
+    }
+  }
+
+  // adjust my bounds: re-home each old bound under whatever
+  // get_subtree_root() now reports for its parent dir.
+  set<CDir*> oldbounds;
+  oldbounds.swap(subtrees[base]);
+  subtrees.erase(base);
+  for (set<CDir*>::iterator b = oldbounds.begin();
+       b != oldbounds.end();
+       ++b) {
+    CDir *newroot = get_subtree_root((*b)->get_parent_dir());
+    subtrees[newroot].insert(*b);
+  }
+
+  show_subtrees(10);
+}
+
+// Completion context: finish() calls MDCache::fragment_go() with the inode,
+// the dirfrag list (copied at construction), the base frag and the bit count.
+class C_MDC_FragmentGo : public Context {
+  MDCache *cache;
+  CInode *dirino;
+  list<CDir*> frags;    // private copy of the caller's list
+  frag_t base;
+  int nbits;
+public:
+  C_MDC_FragmentGo(MDCache *m, CInode *di, list<CDir*>& dls, frag_t bf, int b)
+    : cache(m), dirino(di), frags(dls), base(bf), nbits(b)
+  { }
+
+  // r is ignored
+  virtual void finish(int r) {
+    cache->fragment_go(dirino, frags, base, nbits);
+  }
+};
+
+// Begin splitting an auth dirfrag by `bits`.
+//
+// Gives up (logging why) when the mdsmap is degraded, the dir belongs to the
+// root inode, the dir is already STATE_FRAGMENTING, or it cannot be auth
+// pinned.  Otherwise sets STATE_FRAGMENTING on the dir and calls
+// fragment_freeze() followed by fragment_mark_and_complete().
+void MDCache::split_dir(CDir *dir, int bits)
+{
+  dout(7) << "split_dir " << *dir << " bits " << bits << endl;
+  assert(dir->is_auth());
+
+  // preconditions, checked in this order; the first failure aborts.
+  if (mds->mdsmap->is_degraded()) {
+    dout(7) << "cluster degraded, no fragmenting for now" << endl;
+    return;
+  }
+
+  if (dir->inode->is_root()) {
+    dout(7) << "i won't fragment root" << endl;
+    //assert(0);
+    return;
+  }
+
+  if (dir->state_test(CDir::STATE_FRAGMENTING)) {
+    dout(7) << "already fragmenting" << endl;
+    return;
+  }
+
+  if (!dir->can_auth_pin()) {
+    dout(7) << "not authpinnable on " << *dir << endl;
+    return;
+  }
+
+  // we're splitting a single frag
+  list<CDir*> frags;
+  frags.push_back(dir);
+
+  dir->state_set(CDir::STATE_FRAGMENTING);
+
+  CInode *diri = dir->get_inode();
+  frag_t fg = dir->get_frag();
+  fragment_freeze(diri, frags, fg, bits);
+  fragment_mark_and_complete(diri, frags, fg, bits);
+}
+
+/*
+ * initiate the freeze, blocking it with an auth_pin.
+ *
+ * some reason(s) we have to freeze:
+ *  - on merge, version/projected version are unified from all fragments;
+ *    concurrent pipelined updates in the directory will have divergent
+ *    versioning... and that's no good.
+ *
+ * every dir in frags is auth_pinned and then asked to freeze_dir(); a
+ * C_MDC_FragmentGo (which calls fragment_go()) is the completion of the
+ * gather over those freezes.
+ */
+void MDCache::fragment_freeze(CInode *diri, list<CDir*>& frags, frag_t basefrag, int bits)
+{
+  Context *go = new C_MDC_FragmentGo(this, diri, frags, basefrag, bits);
+  C_Gather *freezes = new C_Gather(go);
+
+  // freeze the dirs
+  list<CDir*>::iterator f = frags.begin();
+  while (f != frags.end()) {
+    CDir *dir = *f;
+    dir->auth_pin();   // this will block the freeze
+    dir->freeze_dir(freezes->new_sub());
+    ++f;
+  }
+}
+
+// Completion context: finish() re-enters MDCache::fragment_mark_and_complete()
+// with the arguments captured at construction (the dirfrag list is copied).
+class C_MDC_FragmentMarking : public Context {
+  MDCache *cache;
+  CInode *dirino;
+  list<CDir*> frags;    // private copy of the caller's list
+  frag_t base;
+  int nbits;
+public:
+  C_MDC_FragmentMarking(MDCache *m, CInode *di, list<CDir*>& dls, frag_t bf, int b)
+    : cache(m), dirino(di), frags(dls), base(bf), nbits(b)
+  { }
+
+  // r is ignored
+  virtual void finish(int r) {
+    cache->fragment_mark_and_complete(dirino, frags, base, nbits);
+  }
+};
+
+// Walk the start frags and, for each one:
+//  - if it is not complete, fetch it; this function is re-run (via
+//    C_MDC_FragmentMarking) as the completion of the gather over the fetches;
+//  - else if it is not yet STATE_DNPINNEDFRAG, pin and flag every dentry,
+//    set STATE_DNPINNEDFRAG and drop the dir's auth_pin;
+//  - else it was already marked; nothing to do.
+void MDCache::fragment_mark_and_complete(CInode *diri,
+                                         list<CDir*>& startfrags,
+                                         frag_t basefrag, int bits)
+{
+  dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits
+           << " on " << *diri << endl;
+
+  C_Gather *fetches = 0;   // created on first incomplete dir
+
+  for (list<CDir*>::iterator f = startfrags.begin();
+       f != startfrags.end();
+       ++f) {
+    CDir *dir = *f;
+
+    if (!dir->is_complete()) {
+      dout(15) << " fetching incomplete " << *dir << endl;
+      if (!fetches) {
+        fetches = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits));
+      }
+      dir->fetch(fetches->new_sub(),
+                 true);  // ignore authpinnability
+      continue;
+    }
+
+    if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+      dout(15) << " marked " << *dir << endl;
+      continue;
+    }
+
+    dout(15) << " marking " << *dir << endl;
+    for (map<string,CDentry*>::iterator it = dir->items.begin();
+         it != dir->items.end();
+         ++it) {
+      CDentry *dn = it->second;
+      dn->get(CDentry::PIN_FRAGMENTING);
+      dn->state_set(CDentry::STATE_FRAGMENTING);
+    }
+    dir->state_set(CDir::STATE_DNPINNEDFRAG);
+    dir->auth_unpin();  // allow our freeze to complete
+  }
+}
+
+
+// Completion context: finish() calls MDCache::fragment_stored() with the
+// inode, base frag, bit count and a copy of the result frag list taken at
+// construction.
+class C_MDC_FragmentStored : public Context {
+  MDCache *cache;
+  CInode *dirino;
+  frag_t base;
+  int nbits;
+  list<CDir*> results;   // private copy of the caller's list
+public:
+  C_MDC_FragmentStored(MDCache *m, CInode *di, frag_t bf, int b,
+                       list<CDir*>& rf)
+    : cache(m), dirino(di), base(bf), nbits(b), results(rf)
+  { }
+
+  // r is ignored
+  virtual void finish(int r) {
+    cache->fragment_stored(dirino, base, nbits, results);
+  }
+};
+
+// Perform the refragmentation via adjust_dir_fragments(), queue the waiters
+// it handed back, then commit each result frag.  fragment_stored() runs (via
+// C_MDC_FragmentStored) as the completion of the gather over those commits.
+// startfrags is not read here.
+void MDCache::fragment_go(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits)
+{
+  dout(10) << "fragment_go " << basefrag << " by " << bits
+           << " on " << *diri << endl;
+
+  // refragment
+  list<CDir*> resultfrags;
+  list<Context*> waiters;
+  adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters);
+  mds->queue_waiters(waiters);
+
+  Context *stored = new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags);
+  C_Gather *commits = new C_Gather(stored);
+
+  // freeze, store resulting frags
+  list<CDir*>::iterator rf = resultfrags.begin();
+  while (rf != resultfrags.end()) {
+    CDir *dir = *rf;
+    dout(10) << " result frag " << *dir << endl;
+    dir->state_set(CDir::STATE_FRAGMENTING);
+    dir->commit(0, commits->new_sub());
+    dir->_freeze_dir();
+    ++rf;
+  }
+}
+
+// Completion context: finish() calls MDCache::fragment_logged().
+//
+// NOTE: the constructor swap()s the caller's result-frag list and version
+// vector into this object, so both of the caller's containers hold this
+// object's (initially empty) contents once construction returns.
+class C_MDC_FragmentLogged : public Context {
+  MDCache *cache;
+  CInode *dirino;
+  frag_t base;
+  int nbits;
+  list<CDir*> results;
+  vector<version_t> versions;
+public:
+  C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b,
+                       list<CDir*>& rf, vector<version_t>& p)
+    : cache(m), dirino(di), base(bf), nbits(b)
+  {
+    // take ownership of the contents rather than copying
+    results.swap(rf);
+    versions.swap(p);
+  }
+
+  // r is ignored
+  virtual void finish(int r) {
+    cache->fragment_logged(dirino, base, nbits,
+                           results, versions);
+  }
+};
+
+// Journal the refragmentation and tell the replicas about it.
+//
+// Builds an EFragment covering every result frag (pre_dirty()ing each one and
+// remembering the projected versions), submits it with a C_MDC_FragmentLogged
+// completion, then sends an MMDSFragmentNotify to every mds in the first
+// result frag's replica_map.  For a merge (bits < 0) the notify also carries
+// a fresh replica of the base dir.
+//
+// @diri - directory inode being refragmented
+// @basefrag - base fragment
+// @bits - bit adjustment (negative means merge)
+// @resultfrags - result frags; EMPTY on return, because
+//                C_MDC_FragmentLogged's constructor swap()s the list away.
+void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits,
+                              list<CDir*>& resultfrags)
+{
+  dout(10) << "fragment_stored " << basefrag << " by " << bits
+           << " on " << *diri << endl;
+
+  EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits);
+
+  set<int> peers;
+  vector<version_t> pvs;
+  for (list<CDir*>::iterator p = resultfrags.begin();
+       p != resultfrags.end();
+       ++p) {
+    CDir *dir = *p;
+    dout(10) << " result frag " << *dir << endl;
+
+    if (p == resultfrags.begin()) {
+      le->metablob.add_dir_context(dir);
+      // note peers
+      // only do this once: all frags have identical replica_maps.
+      for (map<int,int>::iterator r = dir->replica_map.begin();
+           r != dir->replica_map.end();
+           ++r)
+        peers.insert(r->first);
+    }
+
+    pvs.push_back(dir->pre_dirty());
+    le->metablob.add_dir(dir, true);
+  }
+
+  // C_MDC_FragmentLogged's constructor swap()s resultfrags (and pvs) into
+  // itself, so resultfrags is empty after the submit below.  remember the
+  // merge base now; calling front() on the emptied list would be undefined.
+  CDir *mergebase = 0;
+  if (bits < 0 && !resultfrags.empty())
+    mergebase = resultfrags.front();
+
+  mds->mdlog->submit_entry(le,
+                           new C_MDC_FragmentLogged(this, diri, basefrag, bits,
+                                                    resultfrags, pvs));
+
+  // announce to peers
+  for (set<int>::iterator p = peers.begin();
+       p != peers.end();
+       ++p) {
+    MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits);
+    if (bits < 0) {
+      // freshly replicate basedir to peer on merge
+      assert(mergebase);  // peers is only non-empty if there was a first frag
+      CDirDiscover *basedis = mergebase->replicate_to(*p);
+      basedis->_encode(notify->basebl);
+      delete basedis;
+    }
+    mds->send_message_mds(notify, *p, MDS_PORT_CACHE);
+  }
+}
+
+// Completion of the EFragment submit in fragment_stored() (invoked through
+// C_MDC_FragmentLogged).  For every result frag: clear STATE_FRAGMENTING,
+// mark it dirty at the matching projected version, drop the PIN_FRAGMENTING
+// ref on each dentry that has STATE_FRAGMENTING set, and unfreeze the dir.
+//
+// @resultfrags - result frags, in the order they were pre_dirty()ed
+// @pvs - projected versions, one per entry of resultfrags, same order
+void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits,
+                              list<CDir*>& resultfrags,
+                              vector<version_t>& pvs)
+{
+  dout(10) << "fragment_logged " << basefrag << " bits " << bits
+           << " on " << *diri << endl;
+
+  // dirty resulting frags
+  vector<version_t>::iterator pv = pvs.begin();
+  for (list<CDir*>::iterator p = resultfrags.begin();
+       p != resultfrags.end();
+       ++p) {
+    CDir *dir = *p;
+    dout(10) << " result frag " << *dir << endl;
+
+    // dirty, unpin, unfreeze
+    dir->state_clear(CDir::STATE_FRAGMENTING);
+    assert(pv != pvs.end());  // one projected version per result frag
+    dir->mark_dirty(*pv);
+    ++pv;
+
+    for (map<string,CDentry*>::iterator q = dir->items.begin();
+         q != dir->items.end();
+         ++q) {
+      CDentry *dn = q->second;
+      if (dn->state_test(CDentry::STATE_FRAGMENTING))
+        dn->put(CDentry::PIN_FRAGMENTING);
+    }
+
+    dir->unfreeze_dir();
+  }
+}
+
+
+
+// Handle an MMDSFragmentNotify from another mds.  If get_inode() finds the
+// inode, apply the same fragment adjustment locally (for a merge, first add
+// the replica base dir carried in the message) and queue any waiters.  The
+// message is deleted in every case.
+void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
+{
+  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << endl;
+
+  CInode *diri = get_inode(notify->get_ino());
+  if (!diri) {
+    // nothing to adjust
+    delete notify;
+    return;
+  }
+
+  list<Context*> waiters;
+
+  // add replica dir (for merge)?
+  //  (adjust_dir_fragments expects base to already exist, if non-auth)
+  if (notify->get_bits() < 0) {
+    CDirDiscover basedis;
+    int off = 0;
+    basedis._decode(notify->basebl, off);
+    add_replica_dir(diri, notify->get_basefrag(), basedis,
+                    notify->get_source().num(), waiters);
+  }
+
+  // refragment
+  list<CDir*> resultfrags;
+  adjust_dir_fragments(diri, notify->get_basefrag(), notify->get_bits(),
+                       resultfrags, waiters);
+  mds->queue_waiters(waiters);
+
+  delete notify;
+}
+
+
class MClientRequest;
class MMDSSlaveRequest;
+class MMDSFragmentNotify;
+
// MDCache
//typedef const char* pchar;
void adjust_bounded_subtree_auth(CDir *dir, list<dirfrag_t>& bounds, int a) {
adjust_bounded_subtree_auth(dir, bounds, pair<int,int>(a, CDIR_AUTH_UNKNOWN));
}
+ void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
void adjust_export_state(CDir *dir);
void try_subtree_merge(CDir *root);
void try_subtree_merge_at(CDir *root);
+ void subtree_merge_writebehind_finish(CInode *in);
void eval_subtree_root(CDir *dir);
CDir *get_subtree_root(CDir *dir);
void remove_subtree(CDir *dir);
void handle_dentry_unlink(MDentryUnlink *m);
+ // -- fragmenting --
+private:
+ void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+ list<CDir*>& frags, list<Context*>& waiters);
+ friend class EFragment;
+
+public:
+ void split_dir(CDir *dir, int byn);
+
+private:
+ void fragment_freeze(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits);
+ void fragment_mark_and_complete(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits);
+ void fragment_go(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits);
+ void fragment_stored(CInode *diri, frag_t basefrag, int bits, list<CDir*>& resultfrags);
+ void fragment_logged(CInode *diri, frag_t basefrag, int bits, list<CDir*>& resultfrags, vector<version_t>& pvs);
+ friend class C_MDC_FragmentGo;
+ friend class C_MDC_FragmentMarking;
+ friend class C_MDC_FragmentStored;
+ friend class C_MDC_FragmentLogged;
+
+ void handle_fragment_notify(MMDSFragmentNotify *m);
+
+
// -- updates --
//int send_inode_updates(CInode *in);
//void handle_inode_update(MInodeUpdate *m);
LogType mdlog_logtype;
-/*
-MDLog::MDLog(MDS *m) : replay_thread(this)
-{
- mds = m;
- num_events = 0;
- waiting_for_read = false;
-
- last_import_map = 0;
- writing_import_map = false;
- seen_import_map = false;
-
- max_events = g_conf.mds_log_max_len;
-
- capped = false;
-
- unflushed = 0;
-
- journaler = 0;
- logger = 0;
-}
-*/
-
MDLog::~MDLog()
{
static bool didit = false;
if (!didit) {
+ didit = true;
mdlog_logtype.add_inc("add");
mdlog_logtype.add_inc("expire");
mdlog_logtype.add_inc("obs");
dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl;
// encode it, with event type
- bufferlist bl;
- bl.append((char*)&le->_type, sizeof(le->_type));
- le->encode_payload(bl);
-
- // journal it.
- journaler->append_entry(bl);
+ {
+ bufferlist bl;
+ bl.append((char*)&le->_type, sizeof(le->_type));
+ le->encode_payload(bl);
+
+ // journal it.
+ journaler->append_entry(bl); // bl is destroyed.
+ }
assert(!capped);
utime_t stop = g_clock.now();
stop += 2.0;
- while (num_events > max_events &&
- stop > g_clock.now()) {
+ while (num_events > max_events) {
+ // don't check the clock on _every_ event, here!
+ if (num_events % 100 == 0 &&
+ stop < g_clock.now())
+ break;
off_t gap = journaler->get_write_pos() - journaler->get_read_pos();
dout(5) << "trim num_events " << num_events << " > max " << max_events
std::list<Context*> finished;
finished.swap(trim_waiters);
finish_contexts(finished, 0);
-
- // hmm, are we at the end?
- /*
- if (journaler->get_read_pos() == journaler->get_write_pos() &&
- trimming.size() == import_map_expire_waiters.size()) {
- dout(5) << "trim log is empty, allowing import_map to expire" << endl;
- list<Context*> ls;
- ls.swap(import_map_expire_waiters);
- finish_contexts(ls);
- }
- */
}
#include "messages/MClientRequestForward.h"
-LogType mds_logtype, mds_cache_logtype;
-
#include "config.h"
#undef dout
#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " "
server = new Server(this);
locker = new Locker(this, mdcache);
-
// clients
last_client_mdsmap_bcast = 0;
void MDS::reopen_logger()
{
+ static LogType mds_logtype, mds_cache_logtype;
+ static bool didit = false;
+ if (!didit) {
+ didit = true;
+
+ mds_logtype.add_inc("req");
+ mds_logtype.add_inc("reply");
+ mds_logtype.add_inc("fw");
+ mds_logtype.add_inc("cfw");
+
+ mds_logtype.add_set("l");
+ mds_logtype.add_set("q");
+ mds_logtype.add_set("popanyd");
+ mds_logtype.add_set("popnest");
+
+ mds_logtype.add_inc("lih");
+ mds_logtype.add_inc("lif");
+
+ mds_logtype.add_set("c");
+ mds_logtype.add_set("ctop");
+ mds_logtype.add_set("cbot");
+ mds_logtype.add_set("cptail");
+ mds_logtype.add_set("cpin");
+ mds_logtype.add_inc("cex");
+ mds_logtype.add_inc("dis");
+ mds_logtype.add_inc("cmiss");
+
+ mds_logtype.add_set("buf");
+ mds_logtype.add_inc("cdir");
+ mds_logtype.add_inc("fdir");
+
+ mds_logtype.add_inc("iex");
+ mds_logtype.add_inc("iim");
+ mds_logtype.add_inc("ex");
+ mds_logtype.add_inc("im");
+ mds_logtype.add_inc("imex");
+ mds_logtype.add_set("nex");
+ mds_logtype.add_set("nim");
+ }
+
+ if (whoami < 0) return;
+
// flush+close old log
if (logger) {
logger->flush(true);
delete logger2;
}
-
// log
string name;
name = "mds";
logger = new Logger(name, (LogType*)&mds_logtype);
- mds_logtype.add_inc("req");
- mds_logtype.add_inc("reply");
- mds_logtype.add_inc("fw");
- mds_logtype.add_inc("cfw");
-
- mds_logtype.add_set("l");
- mds_logtype.add_set("q");
- mds_logtype.add_set("popanyd");
- mds_logtype.add_set("popnest");
-
- mds_logtype.add_inc("lih");
- mds_logtype.add_inc("lif");
-
- mds_logtype.add_set("c");
- mds_logtype.add_set("ctop");
- mds_logtype.add_set("cbot");
- mds_logtype.add_set("cptail");
- mds_logtype.add_set("cpin");
- mds_logtype.add_inc("cex");
- mds_logtype.add_inc("dis");
- mds_logtype.add_inc("cmiss");
-
- mds_logtype.add_set("buf");
- mds_logtype.add_inc("cdir");
- mds_logtype.add_inc("fdir");
-
- mds_logtype.add_inc("iex");
- mds_logtype.add_inc("iim");
- mds_logtype.add_inc("ex");
- mds_logtype.add_inc("im");
- mds_logtype.add_inc("imex");
- mds_logtype.add_set("nex");
- mds_logtype.add_set("nim");
-
-
char n[80];
sprintf(n, "mds%d.cache", whoami);
logger2 = new Logger(n, (LogType*)&mds_cache_logtype);
+
+ server->reopen_logger();
}
void MDS::send_message_mds(Message *m, int mds, int port, int fromport)
if (logger) {
req_rate = logger->get("req");
- logger->set("l", (int)load.mds_load());
+ logger->set("l", (int)load.mds_load(g_clock.now()));
logger->set("q", messenger->get_dispatch_queue_len());
logger->set("buf", buffer_total_alloc);
beacon_seq_stamp[beacon_last_seq] = g_clock.now();
int mon = monmap->pick_mon();
- messenger->send_message(new MMDSBeacon(messenger->get_myinst(), want_state, beacon_last_seq),
+ messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(),
+ want_state, beacon_last_seq),
monmap->get_inst(mon));
// schedule next sender
dout(0) << "beacon_kill last_acked_stamp " << lab
<< ", killing myself."
<< endl;
- messenger->suicide();
- //exit(0);
+ suicide();
} else {
dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp
<< " != my " << lab
whoami = mdsmap->get_addr_rank(messenger->get_myaddr());
if (whoami < 0) {
dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << endl;
- shutdown_final();
+ suicide();
return;
}
if (oldwhoami != whoami) {
// contemplate suicide
if (mdsmap->get_inst(whoami) != messenger->get_myinst()) {
dout(1) << "apparently i've been replaced by " << mdsmap->get_inst(whoami) << ", committing suicide." << endl;
- shutdown_final();
+ suicide();
return;
}
if (oldstate == MDSMap::STATE_REJOIN ||
oldstate == MDSMap::STATE_RECONNECT)
recovery_done();
-
- dout(1) << "now active" << endl;
finish_contexts(waiting_for_active); // kick waiters
} else if (is_replay()) {
replay_start();
} else if (is_stopped()) {
assert(oldstate == MDSMap::STATE_STOPPING);
dout(1) << "now stopped, sending down:out and exiting" << endl;
- shutdown_final();
+ suicide();
return;
}
}
-int MDS::shutdown_final()
+void MDS::suicide()
{
- dout(1) << "shutdown_final" << endl;
+ dout(1) << "suicide" << endl;
// flush loggers
if (logger) logger->flush(true);
// shut down messenger
messenger->shutdown();
-
- return 0;
}
} while (dest == whoami);
mdcache->migrator->export_dir(dir,dest);
}
+ // hack: thrash fragments
+ for (int i=0; i<g_conf.mds_thrash_fragments; i++) {
+ if (!is_active()) break;
+ dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf.mds_thrash_fragments << endl;
+
+ // pick a random dir inode
+ CInode *in = mdcache->hack_pick_random_inode();
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ if (ls.empty()) continue; // must be an open dir.
+ CDir *dir = ls.front();
+ if (!dir->get_parent_dir()) continue; // must be linked.
+ if (!dir->is_auth()) continue; // must be auth.
+ mdcache->split_dir(dir, 1);// + (rand() % 3));
+ }
// hack: force hash root?
/*
mds_lock.Lock();
dout(10) << "handle_ms_failure to " << inst << " on " << *m << endl;
- if (m->get_type() == MSG_CLIENT_RECONNECT)
+ if (m->get_type() == MSG_MDS_MAP && m->get_dest().is_client())
server->client_reconnect_failure(m->get_dest().num());
delete m;
void shutdown_start();
void stopping_start();
void stopping_done();
- int shutdown_final();
+ void suicide();
void tick();
#include "events/EExport.h"
#include "events/EImportStart.h"
#include "events/EImportFinish.h"
-#include "events/EFragment.h"
#include "msg/Messenger.h"
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator "
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds || l <= g_conf.debug_mds_migrator) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator "
CDir *dir = mds->mdcache->get_dirfrag(df);
if (import_peer[df] == who) {
- switch (import_state[df]) {
+ switch (q->second) {
case IMPORT_DISCOVERING:
dout(10) << "import state=discovering : clearing state" << endl;
import_state.erase(df);
break;
case IMPORT_PREPPING:
- if (import_state[df] == IMPORT_PREPPING) {
+ if (q->second == IMPORT_PREPPING) {
dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl;
}
assert(dir);
// include the dirfrag? only if it's not the bounding subtree root.
if (cur != bound) {
assert(cur->is_auth());
- prep->add_dirfrag( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay!
+ prep->add_dirfrag( cur->replicate_to(dest) ); // yay!
dout(7) << " added " << *cur << endl;
}
it != inode_trace.end();
it++) {
CInode *in = *it;
+ dout(7) << " added " << *in->parent << endl;
dout(7) << " added " << *in << endl;
prep->add_inode( in->parent->get_dir()->dirfrag(),
- in->parent->get_name(),
+ in->parent->get_name(),
+ in->parent->replicate_to(dest),
in->replicate_to(dest) );
}
// set ambiguous auth
cache->adjust_subtree_auth(dir, dest, mds->get_nodeid());
+
+ // take away the popularity we're sending.
+ mds->balancer->subtract_export(dir);
// fill export message with cache data
+ utime_t now = g_clock.now();
C_Contexts *fin = new C_Contexts; // collect all the waiters
map<int,entity_inst_t> exported_client_map;
int num_exported_inodes = encode_export_dir( export_data[dir],
dir, // base
dir, // recur start point
dest,
- exported_client_map );
+ exported_client_map,
+ now );
bufferlist bl;
::_encode(exported_client_map, bl);
export_data[dir].push_front(bl);
// queue up the finisher
dir->add_waiter( CDir::WAIT_UNFREEZE, fin );
- // take away the popularity we're sending. FIXME: do this later?
- mds->balancer->subtract_export(dir);
-
// stats
if (mds->logger) mds->logger->inc("ex");
if (mds->logger) mds->logger->inc("iex", num_exported_inodes);
* used by: encode_export_dir, file_rename (if foreign)
*/
void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth,
- map<int,entity_inst_t>& exported_client_map)
+ map<int,entity_inst_t>& exported_client_map,
+ utime_t now)
{
// tell (all) clients about migrating caps.. mark STALE
for (map<int, Capability>::iterator it = in->client_caps.begin();
// add inode
assert(!in->is_replica(mds->get_nodeid()));
- CInodeExport istate( in );
+ CInodeExport istate(in, now);
istate._encode( enc_state );
// we're export this inode; fix inode state
CDir *basedir,
CDir *dir,
int newauth,
- map<int,entity_inst_t>& exported_client_map)
+ map<int,entity_inst_t>& exported_client_map,
+ utime_t now)
{
int num_exported = 0;
// dir
bufferlist enc_dir;
- CDirExport dstate(dir);
+ CDirExport dstate(dir, now);
dstate._encode( enc_dir );
// release open_by
dout(7) << "encode_export_dir exporting " << *dn << endl;
// name
- _encode(it->first, enc_dir);
+ ::_encode(it->first, enc_dir);
// state
it->second->encode_export_state(enc_dir);
enc_dir.append("L", 1); // remote link
inodeno_t ino = dn->get_remote_ino();
- enc_dir.append((char*)&ino, sizeof(ino));
+ unsigned char d_type = dn->get_remote_d_type();
+ ::_encode(ino, enc_dir);
+ ::_encode(d_type, enc_dir);
continue;
}
// -- inode
enc_dir.append("I", 1); // inode dentry
- encode_export_inode(in, enc_dir, newauth, exported_client_map); // encode, and (update state for) export
+ encode_export_inode(in, enc_dir, newauth, exported_client_map, now); // encode, and (update state for) export
// directory?
list<CDir*> dfs;
// subdirs
for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth,
- exported_client_map);
+ exported_client_map, now);
return num_exported;
}
cache->get_subtree_bounds(dir, bounds);
// log completion
- EExport *le = new EExport(dir);
+ EExport *le = new EExport(mds->mdlog, dir);
+ le->metablob.add_dir_context(dir);
le->metablob.add_dir( dir, false );
for (set<CDir*>::iterator p = bounds.begin();
p != bounds.end();
void Migrator::export_finish(CDir *dir)
{
- dout(7) << "export_finish " << *dir << endl;
+ dout(5) << "export_finish " << *dir << endl;
if (export_state.count(dir) == 0) {
dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl;
dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
assert(0); // this shouldn't happen if the auth pins his path properly!!!!
}
-
- CInode *in;
- if (trace.empty()) {
- in = cache->get_root();
- if (!in) {
- cache->open_root(new C_MDS_RetryMessage(mds, m));
- return;
- }
- } else {
- in = trace[trace.size()-1]->inode;
- }
+
+ assert(0); // this shouldn't happen; the get_inode above would have succeeded.
}
// yay
-
dout(7) << "handle_export_discover have " << df << " inode " << *in << endl;
+ import_state[m->get_dirfrag()] = IMPORT_DISCOVERED;
+
// pin inode in the cache (for now)
assert(in->is_dir());
in->get(CInode::PIN_IMPORTING);
-
+
// reply
dout(7) << " sending export_discover_ack on " << *in << endl;
mds->send_message_mds(new MExportDirDiscoverAck(df),
// make sure we didn't abort
if (import_state.count(m->get_dirfrag()) == 0 ||
- import_state[m->get_dirfrag()] != IMPORT_DISCOVERED ||
+ (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED &&
+ import_state[m->get_dirfrag()] != IMPORT_PREPPING) ||
import_peer[m->get_dirfrag()] != oldauth) {
dout(10) << "handle_export_prep import has aborted, dropping" << endl;
delete m;
map<inodeno_t, fragset_t> bound_dirfragset;
for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
p != m->get_bounds().end();
- ++p)
+ ++p) {
+ dout(10) << " bound " << *p << endl;
bound_dirfragset[p->ino].insert(p->frag);
+ }
// assimilate contents?
if (!m->did_assim()) {
CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) );
assert(condir);
cache->add_inode( in );
- condir->add_dentry( m->get_dentry(in->ino()), in );
+ condir->add_primary_dentry( m->get_dentry(in->ino()), in );
dout(7) << " added " << *in << endl;
}
dout(7) << " pinning import bound " << *bound << endl;
bound->get(CDir::PIN_IMPORTBOUND);
bound->state_set(CDir::STATE_IMPORTBOUND);
- import_bounds.insert(bound);
} else {
dout(7) << " already pinned import bound " << *bound << endl;
}
+ import_bounds.insert(bound);
}
}
cache->process_delayed_expire(dir);
// ok now finish contexts
- dout(5) << "finishing any waiters on imported data" << endl;
+ dout(10) << "finishing any waiters on imported data" << endl;
dir->finish_waiting(CDir::WAIT_IMPORTED);
cache->show_subtrees();
// link before state -- or not! -sage
if (dn->inode != in) {
assert(!dn->inode);
- dn->dir->link_inode(dn, in);
+ dn->dir->link_primary_inode(dn, in);
}
// add inode?
if (in->is_replica(mds->get_nodeid()))
in->remove_replica(mds->get_nodeid());
- // twiddle locks
- /*
- if (in->authlock.do_import(oldauth, mds->get_nodeid()))
- mds->locker->simple_eval(&in->authlock);
- if (in->linklock.do_import(oldauth, mds->get_nodeid()))
- mds->locker->simple_eval(&in->linklock);
- if (in->dirfragtreelock.do_import(oldauth, mds->get_nodeid()))
- mds->locker->simple_eval(&in->dirfragtreelock);
- if (in->dirlock.do_import(oldauth, mds->get_nodeid()))
- mds->locker->scatter_eval(&in->dirlock);
- */
-
// caps
for (set<int>::iterator it = merged_client_caps.begin();
it != merged_client_caps.end();
caps->set_mds( oldauth ); // reap from whom?
mds->send_message_client_maybe_open(caps, imported_client_map[*it]);
}
-
- // filelock
- /*
- if (in->filelock.do_import(oldauth, mds->get_nodeid()))
- mds->locker->simple_eval(&in->filelock);
- */
}
long nden = dstate.get_nden();
for (; nden>0; nden--) {
-
num_imported++;
// dentry
string dname;
- _decode(dname, bl, off);
+ ::_decode(dname, bl, off);
CDentry *dn = dir->lookup(dname);
if (!dn)
- dn = dir->add_dentry(dname); // null
+ dn = dir->add_null_dentry(dname);
// decode state
dn->decode_import_state(bl, off, oldauth, mds->get_nodeid());
else if (icode == 'L') {
// remote link
inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
+ unsigned char d_type;
+ ::_decode(ino, bl, off);
+ ::_decode(d_type, bl, off);
if (dn->is_remote()) {
assert(dn->get_remote_ino() == ino);
} else {
- dir->link_inode(dn, ino);
+ dir->link_remote_inode(dn, ino, d_type);
}
}
else if (icode == 'I') {
dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
<< " on " << *dir << endl;
// adjust auth
- cache->adjust_bounded_subtree_auth(dir, m->get_bounds(), new_auth);
+ set<CDir*> have;
+ cache->map_dirfrag_set(m->get_bounds(), have);
+ cache->adjust_bounded_subtree_auth(dir, have, new_auth);
// induce a merge?
cache->try_subtree_merge(dir);
-
-// ===================================================================
-// FRAGMENT
-
-class C_MDC_FragmentFreeze : public Context {
- Migrator *mig;
- CDir *dir;
- int bits;
-public:
- C_MDC_FragmentFreeze(Migrator *m, CDir *d, int b) : mig(m), dir(d), bits(b) {}
- virtual void finish(int r) {
- if (r >= 0)
- mig->fragment_frozen(dir, bits);
- }
-};
-
-void Migrator::fragment_dir(CDir *dir, int bits)
-{
- dout(7) << "fragment_dir " << *dir << " bits " << bits << endl;
- assert(dir->is_auth());
-
- if (mds->mdsmap->is_degraded()) {
- dout(7) << "cluster degraded, no fragmenting for now" << endl;
- return;
- }
- if (dir->inode->is_root()) {
- dout(7) << "i won't fragment root" << endl;
- //assert(0);
- return;
- }
- if (dir->is_frozen() ||
- dir->is_freezing()) {
- dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl;
- return;
- }
- if (dir->state_test(CDir::STATE_FRAGMENTING)) {
- dout(7) << "already fragmenting" << endl;
- return;
- }
-
- dir->state_set(CDir::STATE_FRAGMENTING);
- dir->get(CDir::PIN_FRAGMENTING);
-
- // first, freeze.
- dir->freeze_dir(new C_MDC_FragmentFreeze(this, dir, bits));
-}
-
-class C_MDC_FragmentLogged : public Context {
- Migrator *mig;
- list<CDir*> dirs;
- int bits;
- version_t pv;
-public:
- C_MDC_FragmentLogged(Migrator *m, list<CDir*>& dls, int b, version_t v) : mig(m), dirs(dls), bits(b), pv(v) {}
- virtual void finish(int r) {
- if (r >= 0)
- mig->fragment_logged(dirs, bits, pv);
- }
-};
-
-void Migrator::fragment_frozen(CDir *dir, int bits)
-{
- dout(7) << "fragment_frozen " << *dir << " bits " << bits << endl;
-
- CInode *diri = dir->get_inode();
-
- // journal it.
- EFragment *le = new EFragment(dir->ino(), dir->get_frag(), bits);
-
- list<CDir*> subfrags;
- list<Context*> waiters;
- version_t pv = dir->pre_dirty();
- diri->fragment_dir(dir->get_frag(), bits, subfrags, waiters);
-
- // predirty and journal content
- for (list<CDir*>::iterator p = subfrags.begin();
- p != subfrags.end();
- ++p) {
- CDir *subfrag = *p;
- le->metablob.add_dir_context(subfrag);
- for (map<string,CDentry*>::iterator q = subfrag->items.begin();
- q != subfrag->items.end();
- ++q) {
- CDentry *dn = q->second;
- dn->set_projected_version(pv);
- le->metablob.add_dentry(dn, true);
- }
- }
-
- // go
- mds->mdlog->submit_entry(le);
- mds->mdlog->wait_for_sync(new C_MDC_FragmentLogged(this, subfrags, bits, pv));
-}
-
-void Migrator::fragment_logged(list<CDir*>& dirs, int bits, version_t pv)
-{
- CInode *diri = dirs.front()->get_inode();
- dout(10) << "fragment_logged " << diri->ino() << " bits " << bits << " pv " << pv << endl;
-
- for (list<CDir*>::iterator p = dirs.begin();
- p != dirs.end();
- p++) {
- CDir *dir = *p;
- dout(10) << " subfrag " << *dir << endl;
-
- // dirty everything
- for (map<string,CDentry*>::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p)
- p->second->mark_dirty(pv);
-
- dir->unfreeze_dir();
- }
-}
class MExportDirNotifyAck;
class MExportDirFinish;
-class MFragmentDirNotify;
-
class EImportStart;
void export_empty_import(CDir *dir);
void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth,
- map<int,entity_inst_t>& exported_client_map);
+ map<int,entity_inst_t>& exported_client_map,
+ utime_t now);
int encode_export_dir(list<bufferlist>& dirstatelist,
class C_Contexts *fin,
CDir *basedir,
CDir *dir,
int newauth,
- map<int,entity_inst_t>& exported_client_map);
+ map<int,entity_inst_t>& exported_client_map,
+ utime_t now);
void add_export_finish_waiter(CDir *dir, Context *c) {
export_finish_waiters[dir].push_back(c);
void handle_export_notify(MExportDirNotify *m);
- // -- fragmenting --
- void fragment_dir(CDir *dir, int byn);
- void fragment_frozen(CDir *dir, int byn);
- friend class C_MDC_FragmentFreeze;
- void fragment_logged(list<CDir*>& dirs, int bits, version_t pv);
- friend class C_MDC_FragmentLogged;
-
- void handle_fragment_notify(MFragmentDirNotify *m);
};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "MDCache.h"
-#include "CInode.h"
-#include "CDir.h"
-#include "MDS.h"
-#include "MDSMap.h"
-#include "MDLog.h"
-#include "AnchorClient.h"
-#include "Migrator.h"
-#include "Renamer.h"
-
-#include "include/filepath.h"
-
-#include "msg/Message.h"
-#include "msg/Messenger.h"
-
-#include "events/EString.h"
-#include "events/EUnlink.h"
-
-#include "messages/MRenameWarning.h"
-#include "messages/MRenameNotify.h"
-#include "messages/MRenameNotifyAck.h"
-#include "messages/MRename.h"
-#include "messages/MRenameAck.h"
-#include "messages/MRenameReq.h"
-#include "messages/MRenamePrep.h"
-
-
-
-void Renamer::dispatch(Message *m)
-{
- switch (m->get_type()) {
- case MSG_MDS_RENAMEWARNING:
- handle_rename_warning((MRenameWarning*)m);
- break;
- case MSG_MDS_RENAMENOTIFY:
- handle_rename_notify((MRenameNotify*)m);
- break;
- case MSG_MDS_RENAMENOTIFYACK:
- handle_rename_notify_ack((MRenameNotifyAck*)m);
- break;
- case MSG_MDS_RENAME:
- handle_rename((MRename*)m);
- break;
- case MSG_MDS_RENAMEREQ:
- handle_rename_req((MRenameReq*)m);
- break;
- case MSG_MDS_RENAMEPREP:
- handle_rename_prep((MRenamePrep*)m);
- break;
- case MSG_MDS_RENAMEACK:
- handle_rename_ack((MRenameAck*)m);
- break;
-
- default:
- assert(0);
- }
-}
-
-
-// renaming!
-
-
-/*
- fix_renamed_dir():
-
- caller has already:
- - relinked inode in new location
- - fixed in->is_auth()
- - set dir_auth, if appropriate
-
- caller has not:
- - touched in->dir
- - updated import/export tables
-*/
-void Renamer::fix_renamed_dir(CDir *srcdir,
- CInode *in,
- CDir *destdir,
- bool authchanged, // _inode_ auth
- int dir_auth) // dir auth (for certain cases)
-{
- dout(7) << "fix_renamed_dir on " << *in << endl;
- dout(7) << "fix_renamed_dir on " << *in->dir << endl;
-
-
- assert(0); // rewrite .
-
- // 1- fix subtree tree.
- // 2- adjust subtree auth.
-
- /*
- if (in->dir->is_auth()) {
- // dir ours
- dout(7) << "dir is auth" << endl;
- assert(!in->dir->is_export());
-
- if (in->is_auth()) {
- // inode now ours
- if (authchanged) {
- // inode _was_ replica, now ours
- dout(7) << "inode was replica, now ours." << endl;
- cache->adjust_subtree_auth(dir, mds->get_nodeid());
- } else {
- // inode was ours, still ours.
- dout(7) << "inode was ours, still ours." << endl;
-
- assert(!in->dir->is_import());
- assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT);
-
- // move any exports nested beneath me?
- CDir *newcon = cache->get_auth_container(in->dir);
- assert(newcon);
- CDir *oldcon = cache->get_auth_container(srcdir);
- assert(oldcon);
- if (newcon != oldcon) {
- dout(7) << "moving nested exports under new container" << endl;
- set<CDir*> nested;
- cache->find_nested_exports_under(oldcon, in->dir, nested);
- for (set<CDir*>::iterator it = nested.begin();
- it != nested.end();
- it++) {
- dout(7) << "moving nested export " << *it << " under new container" << endl;
- cache->nested_exports[oldcon].erase(*it);
- cache->nested_exports[newcon].insert(*it);
- }
- }
- }
-
- } else {
- // inode now replica
-
- if (authchanged) {
- // inode was ours, but now replica
- dout(7) << "inode was ours, now replica. adding to import list." << endl;
-
- // i am now an import
- cache->imports.insert(in->dir);
- in->dir->state_set(CDir::STATE_IMPORT);
- in->dir->get(CDir::PIN_IMPORT);
-
- in->dir->set_dir_auth( mds->get_nodeid() );
- dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
-
- // find old import
- CDir *oldcon = cache->get_auth_container(srcdir);
- assert(oldcon);
- dout(7) << " oldcon is " << *oldcon << endl;
-
- // move nested exports under me
- set<CDir*> nested;
- cache->find_nested_exports_under(oldcon, in->dir, nested);
- for (set<CDir*>::iterator it = nested.begin();
- it != nested.end();
- it++) {
- dout(7) << "moving nested export " << *it << " under me" << endl;
- cache->nested_exports[oldcon].erase(*it);
- cache->nested_exports[in->dir].insert(*it);
- }
-
- } else {
- // inode was replica, still replica
- dout(7) << "inode was replica, still replica. doing nothing." << endl;
- assert(in->dir->is_import());
-
- // verify dir_auth
- assert(in->dir->get_dir_auth().first == mds->get_nodeid()); // me, because i'm auth for dir.
- assert(in->authority() != in->dir->get_dir_auth()); // inode not me.
- }
-
- assert(in->dir->is_import());
- }
-
- } else {
- // dir is not ours
- dout(7) << "dir is not auth" << endl;
-
- if (in->is_auth()) {
- // inode now ours
-
- if (authchanged) {
- // inode was replica, now ours
- dout(7) << "inode was replica, now ours. now an export." << endl;
- assert(!in->dir->is_export());
-
- // now export
- cache->exports.insert(in->dir);
- in->dir->state_set(CDir::STATE_EXPORT);
- in->dir->get(CDir::PIN_EXPORT);
-
- assert(dir_auth >= 0); // better be defined
- in->dir->set_dir_auth( dir_auth );
- dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
-
- CDir *newcon = cache->get_auth_container(in->dir);
- assert(newcon);
- cache->nested_exports[newcon].insert(in->dir);
-
- } else {
- // inode was ours, still ours
- dout(7) << "inode was ours, still ours. did my import change?" << endl;
-
- // sanity
- assert(in->dir->is_export());
- assert(in->dir->get_dir_auth().first >= 0);
- assert(in->dir->get_dir_auth() != in->authority());
-
- // moved under new import?
- CDir *oldcon = cache->get_auth_container(srcdir);
- CDir *newcon = cache->get_auth_container(in->dir);
- if (oldcon != newcon) {
- dout(7) << "moving myself under new import " << *newcon << endl;
- cache->nested_exports[oldcon].erase(in->dir);
- cache->nested_exports[newcon].insert(in->dir);
- }
- }
-
- assert(in->dir->is_export());
- } else {
- // inode now replica
-
- if (authchanged) {
- // inode was ours, now replica
- dout(7) << "inode was ours, now replica. removing from export list." << endl;
- assert(in->dir->is_export());
-
- // remove from export list
- cache->exports.erase(in->dir);
- in->dir->state_clear(CDir::STATE_EXPORT);
- in->dir->put(CDir::PIN_EXPORT);
-
- CDir *oldcon = cache->get_auth_container(srcdir);
- assert(oldcon);
- assert(cache->nested_exports[oldcon].count(in->dir) == 1);
- cache->nested_exports[oldcon].erase(in->dir);
-
- // simplify dir_auth
- if (in->authority() == in->dir->authority()) {
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl;
- } else {
- assert(in->dir->get_dir_auth().first >= 0); // someone else's export,
- }
-
- } else {
- // inode was replica, still replica
- dout(7) << "inode was replica, still replica. do nothing." << endl;
-
- // fix dir_auth?
- if (in->authority().first == dir_auth)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- in->dir->set_dir_auth( dir_auth );
- dout(7) << " fixing dir_auth to be " << dir_auth << endl;
-
- // do nothing.
- }
-
- assert(!in->dir->is_export());
- }
- }
- */
- cache->show_subtrees();
-}
-
-/*
- * when initiator gets an ack back for a foreign rename
- */
-
-class C_MDC_RenameNotifyAck : public Context {
- Renamer *rn;
- CInode *in;
- int initiator;
-
-public:
- C_MDC_RenameNotifyAck(Renamer *r,
- CInode *i, int init) : rn(r), in(i), initiator(init) {}
- void finish(int r) {
- rn->file_rename_ack(in, initiator);
- }
-};
-
-
-
-/************** initiator ****************/
-
-/*
- * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.)
- */
-class C_MDC_RenameAck : public Context {
- Renamer *mdc;
- CDir *srcdir;
- CInode *in;
- Context *c;
-public:
- C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) {
- this->mdc = mdc;
- this->srcdir = srcdir;
- this->in = in;
- this->c = c;
- }
- void finish(int r) {
- mdc->file_rename_finish(srcdir, in, c);
- }
-};
-
-
-void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish)
-{
- assert(srcdn->is_xlocked()); // by me
- assert(destdn->is_xlocked()); // by me
-
- CDir *srcdir = srcdn->dir;
- string srcname = srcdn->name;
-
- CDir *destdir = destdn->dir;
- string destname = destdn->name;
-
- CInode *in = srcdn->inode;
- //Message *req = srcdn->xlockedby;
-
-
- // determine the players
- int srcauth = srcdir->dentry_authority(srcdn->name).first;
- int destauth = destdir->dentry_authority(destname).first;
-
-
- // FOREIGN rename?
- if (srcauth != mds->get_nodeid() ||
- destauth != mds->get_nodeid()) {
- dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl;
-
- string destpath;
- destdn->make_path(destpath);
-
- if (destauth != mds->get_nodeid()) {
- // make sure dest has dir open.
- dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl;
-
- // prep dest first, they must have the dir open! rest will follow.
- string srcpath;
- srcdn->make_path(srcpath);
-
- MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator
- srcdir->ino(), srcname, srcpath,
- destdir->ino(), destname, destpath,
- srcauth); // tell dest who src is (maybe even me)
- mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
-
- cache->show_subtrees();
-
- }
-
- else if (srcauth != mds->get_nodeid()) {
- if (destauth == mds->get_nodeid()) {
- dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl;
- } else {
- dout(7) << "file_rename neither src auth nor dest auth. sending MRenameReq" << endl;
- }
-
- // srcdn not important on destauth, just request
- MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator
- srcdir->ino(), srcname,
- destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know)
- mds->send_message_mds(m, srcauth, MDS_PORT_CACHE);
- }
-
- else
- assert(0);
-
- // set waiter on the inode (is this the best place?)
- in->add_waiter(CInode::WAIT_RENAMEACK,
- new C_MDC_RenameAck(this,
- srcdir, in, onfinish));
- return;
- }
-
- // LOCAL rename!
- assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid());
- dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl;
-
- // update our cache
- if (destdn->inode && destdn->inode->is_dirty())
- destdn->inode->mark_clean();
-
- cache->rename_file(srcdn, destdn);
-
- // update imports/exports?
- if (in->is_dir() && in->dir)
- fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change
-
- // mark dentries dirty
- srcdn->_mark_dirty(); // fixme
- destdn->_mark_dirty(); // fixme
- in->_mark_dirty(); // fixme
-
-
- // local, restrict notify to ppl with open dirs
- set<int> notify;
- for (map<int,int>::iterator it = srcdir->replicas_begin();
- it != srcdir->replicas_end();
- ++it)
- notify.insert(it->first);
- for (map<int,int>::iterator it = destdir->replicas_begin();
- it != destdir->replicas_end();
- it++)
- if (notify.count(it->first) == 0) notify.insert(it->first);
-
- if (notify.size()) {
- // warn + notify
- file_rename_warn(in, notify);
- file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid());
-
- // wait for MRenameNotifyAck's
- in->add_waiter(CInode::WAIT_RENAMENOTIFYACK,
- new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator
-
- // wait for finish
- in->add_waiter(CInode::WAIT_RENAMEACK,
- new C_MDC_RenameAck(this, srcdir, in, onfinish));
- } else {
- // sweet, no notify necessary, we're done!
- file_rename_finish(srcdir, in, onfinish);
- }
-}
-
-void Renamer::handle_rename_ack(MRenameAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
-
- dout(7) << "handle_rename_ack on " << *in << endl;
-
- // all done!
- in->finish_waiting(CInode::WAIT_RENAMEACK);
-
- delete m;
-}
-
-void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c)
-{
- dout(10) << "file_rename_finish on " << *in << endl;
-
- // did i empty out an imported dir? FIXME this check should go somewhere else???
- if (srcdir->is_auth() && !srcdir->inode->is_auth() && srcdir->get_size() == 0)
- cache->migrator->export_empty_import(srcdir);
-
- // finish our caller
- if (c) {
- c->finish(0);
- delete c;
- }
-}
-
-
-/************* src **************/
-
-
-/** handle_rename_req
- * received by auth of src dentry (from init, or destauth if dir).
- * src may not have dest dir open.
- * src will export inode, unlink|rename, and send MRename to dest.
- */
-void Renamer::handle_rename_req(MRenameReq *m)
-{
- // i am auth, i will have it.
- CInode *srcdiri = cache->get_inode(m->get_srcdirino());
- CDir *srcdir = srcdiri->dir;
- CDentry *srcdn = srcdir->lookup(m->get_srcname());
- assert(srcdn);
-
- // do it
- file_rename_foreign_src(srcdn,
- m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(),
- m->get_initiator());
- delete m;
-}
-
-
-void Renamer::file_rename_foreign_src(CDentry *srcdn,
- inodeno_t destdirino, string& destname, string& destpath, int destauth,
- int initiator)
-{
- dout(7) << "file_rename_foreign_src " << *srcdn << endl;
-
- CDir *srcdir = srcdn->dir;
- string srcname = srcdn->name;
-
- // (we're basically exporting this inode)
- CInode *in = srcdn->inode;
- assert(in);
- assert(in->is_auth());
-
- if (in->is_dir()) cache->show_subtrees();
-
- // encode and export inode state
- bufferlist inode_state;
- cache->migrator->encode_export_inode(in, inode_state, destauth);
-
- // send
- MRename *m = new MRename(initiator,
- srcdir->ino(), srcdn->name, destdirino, destname,
- inode_state);
- mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
-
- // have dest?
- CInode *destdiri = cache->get_inode(m->get_destdirino());
- CDir *destdir = 0;
- if (destdiri) destdir = destdiri->dir;
- CDentry *destdn = 0;
- if (destdir) destdn = destdir->lookup(m->get_destname());
-
- // discover src
- if (!destdn) {
- dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl;
-
- filepath destfilepath = destpath;
- vector<CDentry*> trace;
- int r = cache->path_traverse(destfilepath, trace, true,
- m, new C_MDS_RetryMessage(mds, m),
- MDS_TRAVERSE_DISCOVER);
- assert(r>0);
- return;
- }
-
- assert(destdn);
-
- // update our cache
- cache->rename_file(srcdn, destdn);
-
- // update imports/exports?
- if (in->is_dir() && in->dir)
- fix_renamed_dir(srcdir, in, destdir, true); // auth changed
-
- srcdn->_mark_dirty(); // fixme
-
- // proxy!
- //in->state_set(CInode::STATE_PROXY);
- //in->get(CInode::PIN_PROXY);
-
- // generate notify list (everybody but src|dst) and send warnings
- set<int> notify;
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i != mds->get_nodeid() && // except the source
- i != destauth) // and the dest
- notify.insert(i);
- }
- file_rename_warn(in, notify);
-
-
- // wait for MRenameNotifyAck's
- in->add_waiter(CInode::WAIT_RENAMENOTIFYACK,
- new C_MDC_RenameNotifyAck(this, in, initiator));
-}
-
-void Renamer::file_rename_warn(CInode *in,
- set<int>& notify)
-{
- // note gather list
- rename_waiting_for_ack[in->ino()] = notify;
-
- // send
- for (set<int>::iterator it = notify.begin();
- it != notify.end();
- it++) {
- dout(10) << "file_rename_warn to " << *it << " for " << *in << endl;
- mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE);
- }
-}
-
-
-void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m)
-{
- CInode *in = cache->get_inode(m->get_ino());
- assert(in);
- dout(7) << "handle_rename_notify_ack on " << *in << endl;
-
- int source = m->get_source().num();
- rename_waiting_for_ack[in->ino()].erase(source);
- if (rename_waiting_for_ack[in->ino()].empty()) {
- // last one!
- rename_waiting_for_ack.erase(in->ino());
- in->finish_waiting(CInode::WAIT_RENAMENOTIFYACK, 0);
- } else {
- dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl;
- }
-}
-
-
-void Renamer::file_rename_ack(CInode *in, int initiator)
-{
- // we got all our MNotifyAck's.
-
- // was i proxy (if not, it's cuz this was a local rename)
- /*if (in->state_test(CInode::STATE_PROXY)) {
- dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl;
- in->state_clear(CInode::STATE_PROXY);
- in->put(CInode::PIN_PROXY);
- }*/
-
- // done!
- if (initiator == mds->get_nodeid()) {
- // it's me, finish
- dout(7) << "file_rename_ack i am initiator, finishing" << endl;
- in->finish_waiting(CInode::WAIT_RENAMEACK);
- } else {
- // send ack
- dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl;
- mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE);
- }
-}
-
-
-
-
-/************ dest *************/
-
-/** handle_rename_prep
- * received by auth of dest dentry to make sure they have src + dir open.
- * this is so that when they get the inode and dir, they can update exports etc properly.
- * will send MRenameReq to src.
- */
-void Renamer::handle_rename_prep(MRenamePrep *m)
-{
- // open src
- filepath srcpath = m->get_srcpath();
- vector<CDentry*> trace;
- int r = cache->path_traverse(srcpath, trace, false,
- m, new C_MDS_RetryMessage(mds, m),
- MDS_TRAVERSE_DISCOVER);
-
- if (r>0) return;
-
- // ok!
- CInode *srcin = trace[trace.size()-1]->inode;
- assert(srcin);
-
- dout(7) << "handle_rename_prep have srcin " << *srcin << endl;
-
- if (srcin->is_dir()) {
- if (!srcin->dir) {
- dout(7) << "handle_rename_prep need to open dir" << endl;
- cache->open_remote_dir(srcin, frag_t(), // FIXME dirfrag
- new C_MDS_RetryMessage(mds,m));
- return;
- }
-
- dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl;
- }
-
- // pin
- srcin->get(CInode::PIN_RENAMESRC);
-
- // send rename request
- MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator
- m->get_srcdirino(), m->get_srcname(),
- m->get_destdirino(), m->get_destname(), m->get_destpath(),
- mds->get_nodeid()); // i am dest
- mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE);
- delete m;
- return;
-}
-
-
-
-/** handle_rename
- * received by auth of dest dentry. includes exported inode info.
- * dest may not have srcdir open.
- */
-void Renamer::handle_rename(MRename *m)
-{
- // srcdn (required)
- CInode *srcdiri = cache->get_inode(m->get_srcdirino());
- CDir *srcdir = srcdiri->dir;
- CDentry *srcdn = srcdir->lookup(m->get_srcname());
- string srcname = srcdn->name;
- assert(srcdn && srcdn->inode);
-
- dout(7) << "handle_rename srcdn " << *srcdn << endl;
-
- // destdn (required). i am auth, so i will have it.
- CInode *destdiri = cache->get_inode(m->get_destdirino());
- CDir *destdir = destdiri->dir;
- CDentry *destdn = destdir->lookup(m->get_destname());
- string destname = destdn->name;
- assert(destdn);
-
- dout(7) << "handle_rename destdn " << *destdn << endl;
-
- // note old dir auth
- int old_dir_auth = -1;
- if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority().first;
-
- // rename replica into position
- if (destdn->inode && destdn->inode->is_dirty())
- destdn->inode->mark_clean();
-
- cache->rename_file(srcdn, destdn);
-
- // decode + import inode (into new location start)
- int off = 0;
- // HACK
- bufferlist bufstate;
- bufstate.claim_append(m->get_inode_state());
- cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num());
-
- CInode *in = destdn->inode;
- assert(in);
-
- // update imports/exports?
- if (in->is_dir()) {
- assert(in->dir); // i had better already ahve it open.. see MRenamePrep
- fix_renamed_dir(srcdir, in, destdir, true, // auth changed
- old_dir_auth); // src is possibly new dir auth.
- }
-
- // mark dirty
- destdn->_mark_dirty(); // fixme
- in->_mark_dirty(); // fixme
-
- // unpin
- in->put(CInode::PIN_RENAMESRC);
-
- // ok, send notifies.
- set<int> notify;
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i != m->get_source().num() && // except the source
- i != mds->get_nodeid()) // and the dest
- notify.insert(i);
- }
- file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num());
-
- delete m;
-}
-
-
-void Renamer::file_rename_notify(CInode *in,
- CDir *srcdir, string& srcname, CDir *destdir, string& destname,
- set<int>& notify,
- int srcauth)
-{
- /* NOTE: notify list might include myself */
-
- // tell
- string destdirpath;
- destdir->inode->make_path(destdirpath);
-
- for (set<int>::iterator it = notify.begin();
- it != notify.end();
- it++) {
- dout(10) << "file_rename_notify to " << *it << " for " << *in << endl;
- mds->send_message_mds(new MRenameNotify(in->ino(),
- srcdir->ino(),
- srcname,
- destdir->ino(),
- destdirpath,
- destname,
- srcauth),
- *it, MDS_PORT_CACHE);
- }
-}
-
-
-
-/************** bystanders ****************/
-
-void Renamer::handle_rename_warning(MRenameWarning *m)
-{
- // add to warning list
- stray_rename_warnings.insert( m->get_ino() );
-
- // did i already see the notify?
- if (stray_rename_notifies.count(m->get_ino())) {
- // i did, we're good.
- dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl;
-
- handle_rename_notify(stray_rename_notifies[m->get_ino()]);
- stray_rename_notifies.erase(m->get_ino());
- } else {
- dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl;
- }
-
- // done
- delete m;
-}
-
-
-void Renamer::handle_rename_notify(MRenameNotify *m)
-{
- // FIXME: when we do hard links, i think we need to
- // have srcdn and destdn both, or neither, always!
-
- // did i see the warning yet?
- if (!stray_rename_warnings.count(m->get_ino())) {
- // wait for it.
- dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." << endl;
- stray_rename_notifies[m->get_ino()] = m;
- return;
- }
-
- dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl;
-
- // src
- CInode *srcdiri = cache->get_inode(m->get_srcdirino());
- CDir *srcdir = 0;
- if (srcdiri) srcdir = srcdiri->dir;
- CDentry *srcdn = 0;
- if (srcdir) srcdn = srcdir->lookup(m->get_srcname());
-
- // dest
- CInode *destdiri = cache->get_inode(m->get_destdirino());
- CDir *destdir = 0;
- if (destdiri) destdir = destdiri->dir;
- CDentry *destdn = 0;
- if (destdir) destdn = destdir->lookup(m->get_destname());
-
- // have both?
- list<Context*> finished;
- if (srcdn && destdir) {
- CInode *in = srcdn->inode;
-
- int old_dir_auth = -1;
- if (in && in->dir) old_dir_auth = in->dir->authority().first;
-
- if (!destdn) {
- destdn = destdir->add_dentry(m->get_destname()); // create null dentry
- destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked!
- }
-
- dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl;
-
- if (in) {
- cache->rename_file(srcdn, destdn);
-
- // update imports/exports?
- if (in && in->is_dir() && in->dir) {
- fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change
- }
- } else {
- dout(7) << " i don't have the inode (just null dentries)" << endl;
- }
-
- }
-
- else if (srcdn) {
- dout(7) << "handle_rename_notify no dest, but have src" << endl;
- dout(7) << "srcdn is " << *srcdn << endl;
-
- if (destdiri) {
- dout(7) << "have destdiri, opening dir " << *destdiri << endl;
- cache->open_remote_dir(destdiri, frag_t(), // FIXME dirfrag
- new C_MDS_RetryMessage(mds,m));
- } else {
- filepath destdirpath = m->get_destdirpath();
- dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl;
-
- vector<CDentry*> trace;
- int r = cache->path_traverse(destdirpath, trace, true,
- m, new C_MDS_RetryMessage(mds, m),
- MDS_TRAVERSE_DISCOVER);
- assert(r>0);
- }
- return;
- }
-
- else if (destdn) {
- dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl;
- if (destdn->inode) {
- destdir->unlink_inode(destdn);
- }
- }
-
- else {
- dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl;
- assert(srcdn == 0 && destdn == 0);
- }
-
- mds->queue_finished(finished);
-
-
- // ack
- dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl;
- MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino());
- mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE);
-
-
- stray_rename_warnings.erase( m->get_ino() );
- delete m;
-}
-
-
-
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __MDS_RENAMER_H
-#define __MDS_RENAMER_H
-
-#include "include/types.h"
-
-#include <map>
-#include <set>
-using std::map;
-using std::set;
-
-class MDS;
-class MDCache;
-class CDentry;
-class CInode;
-class CDir;
-
-class Message;
-class MRenameWarning;
-class MRenameNotify;
-class MRenameNotifyAck;
-class MRename;
-class MRenamePrep;
-class MRenameReq;
-class MRenameAck;
-
-class Renamer {
- MDS *mds;
- MDCache *cache;
-
- // rename fun
- set<inodeno_t> stray_rename_warnings; // notifies i haven't seen
- map<inodeno_t, MRenameNotify*> stray_rename_notifies;
-
- map<inodeno_t, set<int> > rename_waiting_for_ack;
-
-
-
- void fix_renamed_dir(CDir *srcdir,
- CInode *in,
- CDir *destdir,
- bool authchanged, // _inode_ auth changed
- int dirauth=-1); // dirauth (for certain cases)
-
-
-public:
- Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {}
-
- void dispatch(Message *m);
-
- // RENAME
- // initiator
- public:
- void file_rename(CDentry *srcdn, CDentry *destdn, Context *c);
- protected:
- void handle_rename_ack(MRenameAck *m); // dest -> init (almost always)
- void file_rename_finish(CDir *srcdir, CInode *in, Context *c);
- friend class C_MDC_RenameAck;
-
- // src
- void handle_rename_req(MRenameReq *m); // dest -> src
- void file_rename_foreign_src(CDentry *srcdn,
- inodeno_t destdirino, string& destname, string& destpath, int destauth,
- int initiator);
- void file_rename_warn(CInode *in, set<int>& notify);
- void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src
- void file_rename_ack(CInode *in, int initiator);
- friend class C_MDC_RenameNotifyAck;
-
- // dest
- void handle_rename_prep(MRenamePrep *m); // init -> dest
- void handle_rename(MRename *m); // src -> dest
- void file_rename_notify(CInode *in,
- CDir *srcdir, string& srcname, CDir *destdir, string& destname,
- set<int>& notify, int srcauth);
-
- // bystander
- void handle_rename_warning(MRenameWarning *m); // src -> bystanders
- void handle_rename_notify(MRenameNotify *m); // dest -> bystanders
-
-
-};
-
-#endif
-
-
case LOCK_GLOCKT: return "gLockT";
case LOCK_TEMPSYNC: return "Tempsync";
- default: assert(0);
+ default: assert(0); return 0;
}
}
return LOCK_LOCK;
default:
assert(0);
+ return 0;
}
}
#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
+// (Re)create this server's Logger.
+//
+// The counter keys are added to a function-static LogType the first time
+// this runs; later calls reuse that same LogType.  Any existing logger is
+// flushed and deleted before a new one named "mds<nodeid>.server" is
+// allocated.
+void Server::reopen_logger()
+{
+  static LogType mdserver_logtype;
+  static bool didit = false;
+  if (!didit) {
+    didit = true;
+    mdserver_logtype.add_inc("hcreq");   // handle client req
+    mdserver_logtype.add_inc("hsreq");   // slave
+    mdserver_logtype.add_inc("hcsess");  // client session
+    mdserver_logtype.add_inc("dcreq");   // dispatch client req
+    mdserver_logtype.add_inc("dsreq");   // slave
+  }
+
+  if (logger) {
+    logger->flush();
+    delete logger;
+    // don't leave a dangling pointer behind: if the allocation below
+    // throws, ~Server() must not delete the old logger a second time.
+    logger = 0;
+  }
+
+  // logger
+  char name[80];
+  // bounded formatting: never write past the end of name[]
+  snprintf(name, sizeof(name), "mds%d.server", mds->get_nodeid());
+  logger = new Logger(name, &mdserver_logtype);
+}
+
+
void Server::dispatch(Message *m)
{
switch (m->get_type()) {
mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv),
new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv));
delete m;
+
+ if (logger) logger->inc("hcsess");
}
void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv)
dout(4) << "handle_client_request " << *req << endl;
int client = req->get_client();
+ if (logger) logger->inc("hcreq");
+
if (!mds->is_active()) {
dout(5) << " not active, discarding client request." << endl;
delete req;
{
MClientRequest *req = mdr->client_request;
+ if (logger) logger->inc("dcreq");
+
if (mdr->ref) {
dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << endl;
} else {
// funky.
case MDS_OP_OPEN:
- if ((req->args.open.flags & O_CREAT) &&
- !mdr->ref)
+ if (req->args.open.flags & O_CREAT)
handle_client_openc(mdr);
else
handle_client_open(mdr);
dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << endl;
int from = m->get_source().num();
+ if (logger) logger->inc("hsreq");
+
// reply?
if (m->is_reply()) {
return;
}
+ if (logger) logger->inc("dsreq");
+
switch (mdr->slave_request->get_op()) {
case MMDSSlaveRequest::OP_XLOCK:
{
}
// create
- dn = dir->add_dentry(dname, 0);
+ dn = dir->add_null_dentry(dname);
dn->mark_new();
dout(10) << "prepare_null_dentry added " << *dn << endl;
in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now
dout(10) << "prepare_new_inode " << *in << endl;
- // bump modify pop
- mds->balancer->hit_dir(dir, META_POP_DWR);
-
return in;
}
set<SimpleLock*> xlocks = mdr->xlocks;
int mask = req->args.stat.mask;
- if (mask & INODE_MASK_LINK) rdlocks.insert(&ref->linklock);
- if (mask & INODE_MASK_AUTH) rdlocks.insert(&ref->authlock);
+ if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock);
+ if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock);
if (ref->is_file() &&
- mask & INODE_MASK_FILE) rdlocks.insert(&ref->filelock);
+ mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock);
if (ref->is_dir() &&
- mask & INODE_MASK_MTIME) rdlocks.insert(&ref->dirlock);
+ mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
// apply
in->pop_and_dirty_projected_inode();
+ mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);
+
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply->set_result(0);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- mds->balancer->hit_inode(cur, META_POP_IWR);
-
// project update
inode_t *pi = cur->project_inode();
pi->mtime = req->args.utime.mtime;
pi->ctime = g_clock.real_now();
// log + wait
- EUpdate *le = new EUpdate("utime");
+ EUpdate *le = new EUpdate(mdlog, "utime");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- mds->balancer->hit_inode(cur, META_POP_IWR);
-
// project update
inode_t *pi = cur->project_inode();
pi->mode =
pi->ctime = g_clock.real_now();
// log + wait
- EUpdate *le = new EUpdate("chmod");
+ EUpdate *le = new EUpdate(mdlog, "chmod");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- mds->balancer->hit_inode(cur, META_POP_IWR);
-
// project update
inode_t *pi = cur->project_inode();
pi->uid = MAX(req->args.chown.uid, 0);
pi->ctime = g_clock.real_now();
// log + wait
- EUpdate *le = new EUpdate("chown");
+ EUpdate *le = new EUpdate(mdlog, "chown");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
// READDIR
int Server::encode_dir_contents(CDir *dir,
- list<InodeStat*>& inls,
- list<string>& dnls)
+ list<string>& dnls,
+ list<InodeStat*>& inls)
{
int numfiles = 0;
if (dn->is_null()) continue;
CInode *in = dn->inode;
- if (!in)
- continue; // hmm, fixme!, what about REMOTE links?
-
- dout(12) << "including inode " << *in << endl;
+ InodeStat *st;
+ if (in) {
+ dout(12) << "including inode " << *in << endl;
+
+ // add this item
+ // note: InodeStat makes note of whether inode data is readable.
+ st = new InodeStat(in, mds->get_nodeid());
+ } else {
+ assert(dn->is_remote());
+ dout(12) << "including inode-less (remote) dentry " << *dn << endl;
+ st = new InodeStat;
+ st->mask = STAT_MASK_INO | STAT_MASK_TYPE;
+ memset(&st->inode, 0, sizeof(st->inode));
+ st->inode.ino = dn->get_remote_ino();
+ st->inode.mode = DT_TO_MODE(dn->get_remote_d_type());
+ }
- // add this item
- // note: InodeStat makes note of whether inode data is readable.
dnls.push_back( it->first );
- inls.push_back( new InodeStat(in, mds->get_nodeid()) );
+ inls.push_back(st);
numfiles++;
}
return numfiles;
if (!diri->is_dir()) {
// not a dir
dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl;
- reply_request(mdr, -ENOTDIR);
+ reply_request(mdr, -ENOTDIR, diri);
return;
}
// does the frag exist?
if (diri->dirfragtree[fg] != fg) {
dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl;
- reply_request(mdr, -EAGAIN);
+ reply_request(mdr, -EAGAIN, diri);
return;
}
// build dir contents
list<InodeStat*> inls;
list<string> dnls;
- int numfiles = encode_dir_contents(dir, inls, dnls);
+ int numfiles = encode_dir_contents(dir, dnls, inls);
// . too
//dnls.push_back(".");
// yay, reply
MClientReply *reply = new MClientReply(req);
- reply->take_dir_items(inls, dnls, numfiles);
+ reply->take_dir_items(dnls, inls, numfiles);
dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl;
- reply->set_result(fg);
-
- //balancer->hit_dir(diri->dir);
+ reply->set_result(0);
+
+ // bump popularity. NOTE: this doesn't quite capture it.
+ mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, numfiles);
// reply
reply_request(mdr, reply, diri);
assert(r == 0);
// link the inode
- dn->get_dir()->link_inode(dn, newi);
+ dn->get_dir()->link_primary_inode(dn, newi);
// dirty inode, dn, dir
newi->mark_dirty(newi->inode.version + 1);
mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime);
// hit pop
- mds->balancer->hit_inode(newi, META_POP_IWR);
+ mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR);
+ //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
assert(newi);
// it's a file.
+ newi->inode.rdev = req->args.mknod.rdev;
newi->inode.mode = req->args.mknod.mode;
newi->inode.mode &= ~INODE_TYPE_MASK;
newi->inode.mode |= INODE_MODE_FILE;
newi->inode.version = dn->pre_dirty() - 1;
// prepare finisher
- EUpdate *le = new EUpdate("mknod");
+ EUpdate *le = new EUpdate(mdlog, "mknod");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
newdir->mark_dirty(newdir->pre_dirty());
// prepare finisher
- EUpdate *le = new EUpdate("mkdir");
+ EUpdate *le = new EUpdate(mdlog, "mkdir");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
mdlog->submit_entry(le);
mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
-
/* old export heuristic. pbly need to reimplement this at some point.
if (
diri->dir->is_auth() &&
newi->inode.version = dn->pre_dirty() - 1;
// prepare finisher
- EUpdate *le = new EUpdate("symlink");
+ EUpdate *le = new EUpdate(mdlog, "symlink");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
pi->version = tipv;
// log + wait
- EUpdate *le = new EUpdate("link_local");
+ EUpdate *le = new EUpdate(mdlog, "link_local");
le->metablob.add_client_req(mdr->reqid);
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime
le->metablob.add_dir_context(dn->get_dir());
dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl;
// link and unlock the NEW dentry
- dn->dir->link_inode(dn, targeti->ino());
+ dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
dn->mark_dirty(dnpv);
// target inode
dirty_dn_diri(dn, dirpv, mdr->now);
// bump target popularity
- mds->balancer->hit_inode(targeti, META_POP_IWR);
+ mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
+ //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
dn->pre_dirty();
// add to event
- EUpdate *le = new EUpdate("link_remote");
+ EUpdate *le = new EUpdate(mdlog, "link_remote");
le->metablob.add_client_req(mdr->reqid);
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime
le->metablob.add_dir_context(dn->get_dir());
dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << endl;
// link the new dentry
- dn->dir->link_inode(dn, targeti->ino());
+ dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
dn->mark_dirty(dpv);
// dir inode's mtime
dirty_dn_diri(dn, dirpv, mdr->now);
// bump target popularity
- mds->balancer->hit_inode(targeti, META_POP_IWR);
+ mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
+ //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
dout(10) << " projected inode " << pi << " v " << pi->version << endl;
// journal it
- ESlaveUpdate *le = new ESlaveUpdate("slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
le->metablob.add_dir_context(targeti->get_parent_dir());
le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary
mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc));
// update the target
targeti->pop_and_dirty_projected_inode();
+ // hit pop
+ mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
+
// ack
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK);
mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER);
ESlaveUpdate *le;
if (r == 0) {
// write a commit to the journal
- le = new ESlaveUpdate("slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
+ le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
} else {
- le = new ESlaveUpdate("slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
+ le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
// -- rollback in memory --
assert(targeti->inode.ctime == mdr->now);
// ok, let's do it.
// prepare log entry
- EUpdate *le = new EUpdate("unlink_local");
+ EUpdate *le = new EUpdate(mdlog, "unlink_local");
le->metablob.add_client_req(mdr->reqid);
version_t ipv = 0; // dirty inode version
if (mdr->dst_reanchor_atid)
le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid);
- // finisher
- C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
- dirpv);
-
- journal_opens(); // journal pending opens, just in case
-
// log + wait
+ journal_opens(); // journal pending opens, just in case
mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
-
- mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+ mdlog->wait_for_sync(new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
+ dirpv));
}
void Server::_unlink_local_finish(MDRequest *mdr,
dn->dir->unlink_inode(dn);
// relink as stray? (i.e. was primary link?)
- if (straydn) straydn->dir->link_inode(straydn, in);
+ if (straydn) straydn->dir->link_primary_inode(straydn, in);
// nlink--, dirty old dentry
in->pop_and_dirty_projected_inode();
// dir inode's mtime
dirty_dn_diri(dn, dirpv, mdr->now);
- // bump target popularity
- mds->balancer->hit_dir(dn->dir, META_POP_DWR);
-
// share unlink news with replicas
for (map<int,int>::iterator it = dn->replicas_begin();
it != dn->replicas_end();
if (mdr->dst_reanchor_atid)
mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ // bump pop
+ //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
+
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref
// ok, let's do it.
// prepare log entry
- EUpdate *le = new EUpdate("unlink_remote");
+ EUpdate *le = new EUpdate(mdlog, "unlink_remote");
le->metablob.add_client_req(mdr->reqid);
// the unlinked dentry
// log + wait
mdlog->submit_entry(le);
mdlog->wait_for_sync(fin);
-
- mds->balancer->hit_dir(dn->dir, META_POP_DWR);
}
void Server::_unlink_remote_finish(MDRequest *mdr,
// dir inode's mtime
dirty_dn_diri(dn, dirpv, mdr->now);
- // bump target popularity
- mds->balancer->hit_dir(dn->dir, META_POP_DWR);
-
// share unlink news with replicas
for (map<int,int>::iterator it = dn->replicas_begin();
it != dn->replicas_end();
if (mdr->dst_reanchor_atid)
mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
+
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref
}
// -- prepare journal entry --
- EUpdate *le = new EUpdate("rename");
+ EUpdate *le = new EUpdate(mdlog, "rename");
le->metablob.add_client_req(mdr->reqid);
_rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn);
if (mdr->src_reanchor_atid) mds->anchorclient->commit(mdr->src_reanchor_atid);
if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ // bump popularity
+ //if (srcdn->is_auth())
+ //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
+ // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR);
+ if (destdn->is_remote() &&
+ destdn->inode->is_auth())
+ mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR);
+
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
- reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref
+ reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref
// clean up?
if (straydn)
mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob);
}
- inode_t *ji = 0; // journaled inode getting nlink--
- version_t ipv; // it's version
+ inode_t *ji; // journaled inode getting nlink--
+ version_t ipv = 0; // it's version
if (linkmerge) {
dout(10) << "will merge remote+primary links" << endl;
}
}
- if (ji) {
+ if (ipv) {
// update journaled target inode
inode_t *pi = destdn->inode->project_inode();
pi->nlink--;
// move inode to dest
srcdn->dir->unlink_inode(srcdn);
destdn->dir->unlink_inode(destdn);
- destdn->dir->link_inode(destdn, oldin);
+ destdn->dir->link_primary_inode(destdn, oldin);
// nlink--
destdn->inode->inode.nlink--;
// relink oldin to stray dir. destdn was primary.
assert(oldin);
- straydn->dir->link_inode(straydn, oldin);
+ straydn->dir->link_primary_inode(straydn, oldin);
//assert(straypv == ipv);
// nlink-- in stray dir.
if (srcdn->is_remote()) {
// srcdn was remote.
srcdn->dir->unlink_inode(srcdn);
- destdn->dir->link_inode(destdn, in->ino());
+ destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
if (destdn->is_auth())
destdn->mark_dirty(mdr->pvmap[destdn]);
} else {
// srcdn was primary.
srcdn->dir->unlink_inode(srcdn);
- destdn->dir->link_inode(destdn, in);
+ destdn->dir->link_primary_inode(destdn, in);
// srcdn inode import?
if (!srcdn->is_auth() && destdn->is_auth()) {
destdn->inode->is_auth() ||
srcdn->inode->is_any_caps()) {
// journal.
- ESlaveUpdate *le = new ESlaveUpdate("slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
_rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn);
mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn));
} else {
// set up commit waiter
mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
+ // bump popularity
+ //if (srcdn->is_auth())
+ //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
+ if (destdn->inode->is_auth())
+ mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR);
+
// done.
delete mdr->slave_request;
mdr->slave_request = 0;
_rename_apply(mdr, srcdn, destdn, straydn);
// write a commit to the journal
- le = new ESlaveUpdate("slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
+ le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
} else {
// abort
- le = new ESlaveUpdate("slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
+ le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
}
mds->mdlog->submit_entry(le);
}
map<int,entity_inst_t> exported_client_map;
bufferlist inodebl;
mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, mdr->slave_to_mds,
- exported_client_map);
+ exported_client_map,
+ mdr->now);
::_encode(exported_client_map, reply->inode_export);
reply->inode_export.claim_append(inodebl);
in->inode.mtime = ctime;
in->mark_dirty(pv);
- // hit pop
- mds->balancer->hit_inode(in, META_POP_IWR);
-
// reply
mds->server->reply_request(mdr, 0);
}
pdv, req->args.truncate.length, ctime);
// log + wait
- EUpdate *le = new EUpdate("truncate");
+ EUpdate *le = new EUpdate(mdlog, "truncate");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length);
<< " on " << *cur << endl;
// hit pop
+ mdr->now = g_clock.now();
if (cmode == FILE_MODE_RW ||
cmode == FILE_MODE_W)
- mds->balancer->hit_inode(cur, META_POP_IWR);
+ mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR);
else
- mds->balancer->hit_inode(cur, META_POP_IRD);
+ mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD);
// reply
MClientReply *reply = new MClientReply(req, 0);
++p) {
(*p)->put(CInode::PIN_BATCHOPENJOURNAL);
if ((*p)->is_any_caps()) {
- if (!le) le = new EOpen;
+ if (!le) le = new EOpen(mdlog);
le->add_inode(*p);
(*p)->last_open_journaled = mds->mdlog->get_write_pos();
}
in->inode.mtime = ctime;
in->mark_dirty(pv);
- // hit pop
- mds->balancer->hit_inode(in, META_POP_IWR);
-
// do the open
mds->server->_do_open(mdr, in);
}
void finish(int r) {
assert(r == 0);
+ // hit pop
+ mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);
+
// purge also...
mds->mdcache->purge_inode(&in->inode, 0);
mds->mdcache->wait_for_purge(in->inode.ino, 0,
pdv, ctime);
// log + wait
- EUpdate *le = new EUpdate("open_truncate");
+ EUpdate *le = new EUpdate(mdlog, "open_truncate");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_inode_truncate(cur->inode, 0);
assert(r == 0);
// link the inode
- dn->get_dir()->link_inode(dn, newi);
+ dn->get_dir()->link_primary_inode(dn, newi);
// dirty inode, dn, dir
newi->mark_dirty(pv);
mdr->ref = newi;
mdr->pin(newi);
- // hit pop
- mds->balancer->hit_inode(newi, META_POP_IWR);
-
// ok, do the open.
mds->server->handle_client_open(mdr);
}
// prepare finisher
C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in);
- EUpdate *le = new EUpdate("openc");
+ EUpdate *le = new EUpdate(mdlog, "openc");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version());
le->metablob.add_dir_context(dn->dir);
#include "MDS.h"
+class Logger;
class LogEvent;
class C_MDS_rename_finish;
class MDRequest;
MDCache *mdcache;
MDLog *mdlog;
Messenger *messenger;
+ Logger *logger;
public:
Server(MDS *m) :
mds(m),
mdcache(mds->mdcache), mdlog(mds->mdlog),
- messenger(mds->messenger) {
+ messenger(mds->messenger),
+ logger(0) {
}
+ // Free the Logger, if one was ever allocated; logger starts out as 0 and
+ // deleting a null pointer is a no-op.
+ ~Server() { delete logger; }
+
+ void reopen_logger();
// message handler
void dispatch(Message *m);
void handle_client_chmod(MDRequest *mdr);
void handle_client_chown(MDRequest *mdr);
void handle_client_readdir(MDRequest *mdr);
- int encode_dir_contents(CDir *dir, list<class InodeStat*>& inls, list<string>& dnls);
+ int encode_dir_contents(CDir *dir,
+ list<string>& dnls,
+ list<class InodeStat*>& inls);
void handle_client_truncate(MDRequest *mdr);
void handle_client_fsync(MDRequest *mdr);
case LOCK_OTYPE_ILINK: return "ilink";
case LOCK_OTYPE_IDIRFRAGTREE: return "idft";
case LOCK_OTYPE_IDIR: return "idir";
- default: assert(0);
+ default: assert(0); return 0;
}
}
case LOCK_LOCK: return "lock";
case LOCK_GLOCKR: return "glockr";
case LOCK_REMOTEXLOCK: return "remote_xlock";
- default: assert(0);
+ default: assert(0); return 0;
}
}
// lock state
int state;
- set<int32_t> gather_set; // auth
+ set<int> gather_set; // auth
// local state
int num_rdlock;
set<dirfrag_t> bounds;
public:
- EExport(CDir *dir) : LogEvent(EVENT_EXPORT),
- base(dir->dirfrag()) {
- metablob.add_dir_context(dir);
- }
EExport() : LogEvent(EVENT_EXPORT) { }
+ // Export event for `dir`: the metablob is constructed from the MDLog and
+ // `base` is set to the directory's dirfrag.
+ EExport(MDLog *mdlog, CDir *dir)
+   : LogEvent(EVENT_EXPORT),
+     metablob(mdlog),
+     base(dir->dirfrag())
+ { }
set<dirfrag_t> &get_bounds() { return bounds; }
class EFragment : public LogEvent {
public:
+ EMetaBlob metablob;
inodeno_t ino;
frag_t basefrag;
int bits; // positive for split (from basefrag), negative for merge (to basefrag)
- EMetaBlob metablob;
EFragment() : LogEvent(EVENT_FRAGMENT) { }
- EFragment(inodeno_t i, frag_t bf, int b) :
- LogEvent(EVENT_FRAGMENT),
- ino(i), basefrag(bf), bits(b) { }
+ // Fragment event on inode `i`, relative to base frag `bf`, by `b` bits
+ // (see the `bits` member for the meaning of the sign).  The metablob is
+ // constructed from the MDLog.
+ EFragment(MDLog *mdlog, inodeno_t i, frag_t bf, int b)
+   : LogEvent(EVENT_FRAGMENT),
+     metablob(mdlog),
+     ino(i),
+     basefrag(bf),
+     bits(b)
+ { }
void print(ostream& out) {
out << "EFragment " << ino << " " << basefrag << " by " << bits << " " << metablob;
}
void encode_payload(bufferlist& bl) {
- ::_encode(ino, bl);
- ::_encode(basefrag, bl);
+ ::_encode(ino, bl);
+ ::_encode(basefrag, bl);
::_encode(bits, bl);
- metablob._encode(bl);
+ metablob._encode(bl);
}
void decode_payload(bufferlist& bl, int& off) {
- ::_decode(ino, bl, off);
- ::_decode(basefrag, bl, off);
+ ::_decode(ino, bl, off);
+ ::_decode(basefrag, bl, off);
::_decode(bits, bl, off);
- metablob._decode(bl, off);
+ metablob._decode(bl, off);
}
bool has_expired(MDS *mds);
#include "../CDentry.h"
class MDS;
+class MDLog;
/*
* a bunch of metadata in the journal
string symlink;
bool dirty;
- fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { }
- fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { }
+ fullbit(const string& d, version_t v, inode_t& i, bool dr) :
+ dn(d), dnv(v), inode(i), dirty(dr) { }
+ fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) :
+ dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { }
fullbit(bufferlist& bl, int& off) { _decode(bl, off); }
void _encode(bufferlist& bl) {
::_encode(dn, bl);
- bl.append((char*)&dnv, sizeof(dnv));
- bl.append((char*)&inode, sizeof(inode));
+ ::_encode(dnv, bl);
+ ::_encode(inode, bl);
if (inode.is_symlink())
::_encode(symlink, bl);
- bl.append((char*)&dirty, sizeof(dirty));
+ ::_encode(dirty, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(dn, bl, off);
- bl.copy(off, sizeof(dnv), (char*)&dnv);
- off += sizeof(dnv);
- bl.copy(off, sizeof(inode), (char*)&inode);
- off += sizeof(inode);
+ ::_decode(dnv, bl, off);
+ ::_decode(inode, bl, off);
if (inode.is_symlink())
::_decode(symlink, bl, off);
- bl.copy(off, sizeof(dirty), (char*)&dirty);
- off += sizeof(dirty);
+ ::_decode(dirty, bl, off);
}
void print(ostream& out) {
out << " fullbit dn " << dn << " dnv " << dnv
string dn;
version_t dnv;
inodeno_t ino;
+ unsigned char d_type;
bool dirty;
- remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { }
+ remotebit(const string& d, version_t v, inodeno_t i, unsigned char dt, bool dr) :
+ dn(d), dnv(v), ino(i), d_type(dt), dirty(dr) { }
remotebit(bufferlist& bl, int& off) { _decode(bl, off); }
void _encode(bufferlist& bl) {
::_encode(dn, bl);
- bl.append((char*)&dnv, sizeof(dnv));
- bl.append((char*)&ino, sizeof(ino));
- bl.append((char*)&dirty, sizeof(dirty));
+ ::_encode(dnv, bl);
+ ::_encode(ino, bl);
+ ::_encode(d_type, bl);
+ ::_encode(dirty, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(dn, bl, off);
- bl.copy(off, sizeof(dnv), (char*)&dnv);
- off += sizeof(dnv);
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- bl.copy(off, sizeof(dirty), (char*)&dirty);
- off += sizeof(dirty);
+ ::_decode(dnv, bl, off);
+ ::_decode(ino, bl, off);
+ ::_decode(d_type, bl, off);
+ ::_decode(dirty, bl, off);
}
void print(ostream& out) {
out << " remotebit dn " << dn << " dnv " << dnv
nullbit(bufferlist& bl, int& off) { _decode(bl, off); }
void _encode(bufferlist& bl) {
::_encode(dn, bl);
- bl.append((char*)&dnv, sizeof(dnv));
- bl.append((char*)&dirty, sizeof(dirty));
+ ::_encode(dnv, bl);
+ ::_encode(dirty, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(dn, bl, off);
- bl.copy(off, sizeof(dnv), (char*)&dnv);
- off += sizeof(dnv);
- bl.copy(off, sizeof(dirty), (char*)&dirty);
- off += sizeof(dirty);
+ ::_decode(dnv, bl, off);
+ ::_decode(dirty, bl, off);
}
void print(ostream& out) {
out << " nullbit dn " << dn << " dnv " << dnv
/* dirlump - contains metadata for any dir we have contents for.
*/
+public:
struct dirlump {
static const int STATE_COMPLETE = (1<<1);
static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
::_encode(nremote, bl);
::_encode(nnull, bl);
_encode_bits();
- ::_encode(dnbl, bl);
+ ::_encode_destructively(dnbl, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(dirv, bl, off);
dn_decoded = false; // don't decode bits unless we need them.
}
};
-
+
+private:
// my lumps. preserve the order we added them in a list.
list<dirfrag_t> lump_order;
map<dirfrag_t, dirlump> lump_map;
list<metareqid_t> client_reqs;
public:
+ // soft state
+ off_t last_subtree_map;
+ off_t my_offset;
+
+ EMetaBlob() : last_subtree_map(0), my_offset(0) { }
+ EMetaBlob(MDLog *mdl); // defined in journal.cc
+
void print(ostream& out) {
for (list<dirfrag_t>::iterator p = lump_order.begin();
p != lump_order.end();
}
void add_null_dentry(CDentry *dn, bool dirty) {
+ add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty);
+ }
+ void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) {
// add the dir
- dirlump& lump = add_dir(dn->get_dir(), false);
-
lump.nnull++;
if (dirty)
lump.get_dnull().push_front(nullbit(dn->get_name(),
}
void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) {
- if (!rino)
+ add_remote_dentry(add_dir(dn->get_dir(), false),
+ dn, dirty, rino);
+ }
+ void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty,
+ inodeno_t rino=0, unsigned char rdt=0) {
+ if (!rino) {
rino = dn->get_remote_ino();
-
- dirlump& lump = add_dir(dn->get_dir(), false);
-
+ rdt = dn->get_remote_d_type();
+ }
lump.nremote++;
if (dirty)
lump.get_dremote().push_front(remotebit(dn->get_name(),
dn->get_projected_version(),
- rino,
+ rino, rdt,
dirty));
else
lump.get_dremote().push_back(remotebit(dn->get_name(),
dn->get_projected_version(),
- rino,
+ rino, rdt,
dirty));
}
// return remote pointer to to-be-journaled inode
- inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) {
- if (!in) in = dn->get_inode();
+ inode_t *add_primary_dentry(CDentry *dn, bool dirty,
+ CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) {
+ return add_primary_dentry(add_dir(dn->get_dir(), false),
+ dn, dirty, in, pi, pdft);
+ }
+ inode_t *add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty,
+ CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) {
+ if (!in)
+ in = dn->get_inode();
- dirlump& lump = add_dir(dn->get_dir(), false);
+ // make note of where this inode was last journaled
+ in->last_journaled = my_offset;
lump.nfull++;
if (dirty) {
// convenience: primary or remote? figure it out.
inode_t *add_dentry(CDentry *dn, bool dirty) {
+ dirlump& lump = add_dir(dn->get_dir(), false);
+ return add_dentry(lump, dn, dirty);
+ }
+ inode_t *add_dentry(dirlump& lump, CDentry *dn, bool dirty) {
// primary or remote
if (dn->is_remote()) {
add_remote_dentry(dn, dirty);
dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) {
- dirfrag_t df = dir->dirfrag();
+ return add_dir(dir->dirfrag(), dir->get_projected_version(), dirty, complete);
+ }
+ dirlump& add_dir(dirfrag_t df, version_t pv, bool dirty, bool complete=false) {
if (lump_map.count(df) == 0) {
lump_order.push_back(df);
- lump_map[df].dirv = dir->get_projected_version();
+ lump_map[df].dirv = pv;
}
dirlump& l = lump_map[df];
if (complete) l.mark_complete();
if (lump_map.count(dir->dirfrag()))
return;
- // stop at subtree root?
- if (mode == TO_AUTH_SUBTREE_ROOT &&
- dir->is_subtree_root() && dir->is_auth())
- return;
+ if (mode == TO_AUTH_SUBTREE_ROOT) {
+ // subtree root?
+ if (dir->is_subtree_root() && dir->is_auth())
+ return;
+ // was the inode journaled since the last subtree_map?
+ if (dir->inode->last_journaled >= last_subtree_map)
+ return;
+ }
// stop at root/stray
CInode *diri = dir->get_inode();
if (!diri->get_parent_dn())
return;
+ // journaled?
+
// add parent dn
CDentry *parent = diri->get_parent_dn();
add_dir_context(parent->get_dir(), mode);
list<inodeno_t> inos;
EOpen() : LogEvent(EVENT_OPEN) { }
- EOpen(CInode *in) : LogEvent(EVENT_OPEN) {
- add_inode(in);
- }
+ // Open event whose metablob is constructed from the MDLog.  No inode is
+ // added here; callers attach inodes afterwards with add_inode().
+ EOpen(MDLog *mdlog)
+   : LogEvent(EVENT_OPEN),
+     metablob(mdlog)
+ { }
+
void print(ostream& out) {
out << "EOpen " << metablob;
}
const static int OP_COMMIT = 2;
const static int OP_ROLLBACK = 3;
+ EMetaBlob metablob;
string type;
metareqid_t reqid;
int master;
int op; // prepare, commit, abort
- EMetaBlob metablob;
ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { }
- ESlaveUpdate(const char *s, metareqid_t ri, int mastermds, int o) :
- LogEvent(EVENT_SLAVEUPDATE),
+ ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) :
+ LogEvent(EVENT_SLAVEUPDATE), metablob(mdlog),
type(s),
reqid(ri),
master(mastermds),
string type;
EUpdate() : LogEvent(EVENT_UPDATE) { }
- EUpdate(const char *s) : LogEvent(EVENT_UPDATE),
- type(s) { }
+ // Update event labelled with the type string `s`; its metablob is
+ // constructed from the MDLog.
+ EUpdate(MDLog *mdlog, const char *s)
+   : LogEvent(EVENT_UPDATE),
+     metablob(mdlog),
+     type(s)
+ { }
void print(ostream& out) {
if (type.length())
// -----------------------
// EMetaBlob
+// Construct a blob from the given log: records the log's last subtree map
+// offset and its current write position at construction time.
+// mdlog is dereferenced unconditionally, so it must be non-null.
+EMetaBlob::EMetaBlob(MDLog *mdlog) :
+ last_subtree_map(mdlog->get_last_subtree_map_offset()),
+ my_offset(mdlog->get_write_pos())
+{
+}
+
+
/*
* we need to ensure that a journaled item has either
*
<< " for " << *dir << endl;
continue; // not our problem
}
- if (dir->get_committed_version() >= lp->second.dirv) {
+ if (dir->get_committed_version() >= lp->second.dirv ||
+ dir->get_committed_version_equivalent() >= lp->second.dirv) {
dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv
<< " for " << *dir << endl;
continue; // yay
<< " for " << *dir << endl;
continue; // not our problem
}
- if (dir->get_committed_version() >= lp->second.dirv) {
+ if (dir->get_committed_version() >= lp->second.dirv ||
+ dir->get_committed_version_equivalent() >= lp->second.dirv) {
dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv
<< " on " << *dir << endl;
continue; // yay
continue;
}
}
- if (dir->get_committed_version() < lp->second.dirv) {
- dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv
- << ", committing " << *dir << endl;
- commit[dir] = MAX(commit[dir], lp->second.dirv);
- ncommit++;
- continue;
- }
-
- assert(0); // hrm
+
+ assert(dir->get_committed_version() < lp->second.dirv);
+ dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv
+ << ", committing " << *dir << endl;
+ commit[dir] = MAX(commit[dir], lp->second.dirv);
+ ncommit++;
}
// set up gather context
p++) {
CDentry *dn = dir->lookup(p->dn);
if (!dn) {
- dn = dir->add_dentry( p->dn );
+ dn = dir->add_null_dentry(p->dn);
dn->set_version(p->dnv);
if (p->dirty) dn->_mark_dirty();
dout(10) << "EMetaBlob.replay added " << *dn << endl;
in->inode = p->inode;
if (in->inode.is_symlink()) in->symlink = p->symlink;
mds->mdcache->add_inode(in);
- dir->link_inode(dn, in);
+ dir->link_primary_inode(dn, in);
if (p->dirty) in->_mark_dirty();
dout(10) << "EMetaBlob.replay added " << *in << endl;
} else {
}
in->inode = p->inode;
if (in->inode.is_symlink()) in->symlink = p->symlink;
- dir->link_inode(dn, in);
+ dir->link_primary_inode(dn, in);
if (p->dirty) in->_mark_dirty();
dout(10) << "EMetaBlob.replay linked " << *in << endl;
}
p++) {
CDentry *dn = dir->lookup(p->dn);
if (!dn) {
- dn = dir->add_dentry(p->dn, p->ino);
- dn->set_remote_ino(p->ino);
+ dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type);
dn->set_version(p->dnv);
if (p->dirty) dn->_mark_dirty();
dout(10) << "EMetaBlob.replay added " << *dn << endl;
dout(10) << "EMetaBlob.replay unlinking " << *dn << endl;
dir->unlink_inode(dn);
}
- dn->set_remote_ino(p->ino);
+ dn->set_remote(p->ino, p->d_type);
dn->set_version(p->dnv);
if (p->dirty) dn->_mark_dirty();
dout(10) << "EMetaBlob.replay had " << *dn << endl;
p++) {
CDentry *dn = dir->lookup(p->dn);
if (!dn) {
- dn = dir->add_dentry(p->dn);
+ dn = dir->add_null_dentry(p->dn);
dn->set_version(p->dnv);
if (p->dirty) dn->_mark_dirty();
dout(10) << "EMetaBlob.replay added " << *dn << endl;
default:
assert(0);
+ return false;
}
}
CInode *in = mds->mdcache->get_inode(ino);
assert(in);
- //in->fragment_dir(basefrag, bits);
+ list<CDir*> resultfrags;
+ list<Context*> waiters;
+ mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters);
+
metablob.replay(mds);
}
#define MDS_INO_ROOT 1
#define MDS_INO_PGTABLE 2
#define MDS_INO_ANCHORTABLE 3
+#define MDS_INO_PG 4 // this should match osd/osd_types.h PG_INO
#define MDS_INO_LOG_OFFSET 0x100
#define MDS_INO_IDS_OFFSET 0x200
#define MDS_INO_CLIENTMAP_OFFSET 0x300
struct metareqid_t {
- int client;
+ int32_t client;
tid_t tid;
metareqid_t() : client(-1), tid(0) {}
metareqid_t(int c, tid_t t) : client(c), tid(t) {}
dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
};
-inline ostream& operator<<(ostream& out, const dirfrag_t& df) {
- return out << df.ino << "#" << df.frag;
+// Print a dirfrag as "<ino>", followed by ".<frag>" only when the frag is
+// not the root frag.
+inline ostream& operator<<(ostream& out, const dirfrag_t df) {
+ out << df.ino;
+ if (!df.frag.is_root()) out << "." << df.frag;
+ return out;
}
inline bool operator<(dirfrag_t l, dirfrag_t r) {
if (l.ino < r.ino) return true;
// ================================================================
+#define META_POP_IRD 0
+#define META_POP_IWR 1
+#define META_POP_READDIR 2
+#define META_POP_FETCH 3
+#define META_POP_STORE 4
+#define META_NPOP 5
+
+// Fixed-size vector of NUM (2) decaying popularity counters for an inode.
+// NOTE(review): presumably indexed by META_POP_IRD / META_POP_IWR -- confirm
+// against callers.
+class inode_load_vec_t {
+  static const int NUM = 2;
+  DecayCounter vec[NUM];
+public:
+  // Return the counter for type t; t must be in [0, NUM).
+  DecayCounter &get(int t) {
+    assert(t >= 0 && t < NUM);  // a negative t would index before vec[0]
+    return vec[t];
+  }
+  // Reset every counter via DecayCounter::reset(now).
+  void zero(utime_t now) {
+    for (int i=0; i<NUM; i++)
+      vec[i].reset(now);
+  }
+};
+
+// Fixed-size vector of NUM (5) decaying popularity counters for a dirfrag,
+// indexed by the META_POP_* constants (META_POP_IRD .. META_POP_STORE).
+class dirfrag_load_vec_t {
+public:
+  static const int NUM = 5;
+  DecayCounter vec[NUM];
+
+  // Return the counter for type t; t must be in [0, NUM).
+  DecayCounter &get(int t) {
+    assert(t >= 0 && t < NUM);  // a negative t would index before vec[0]
+    return vec[t];
+  }
+  // Apply adjust(now, d) to every counter.
+  void adjust(utime_t now, double d) {
+    for (int i=0; i<NUM; i++)
+      vec[i].adjust(now, d);
+  }
+  // Reset every counter via DecayCounter::reset(now).
+  void zero(utime_t now) {
+    for (int i=0; i<NUM; i++)
+      vec[i].reset(now);
+  }
+  // Weighted sum of the counters' values at `now`:
+  // IRD and READDIR weigh 1, IWR and FETCH weigh 2, STORE weighs 4.
+  double meta_load(utime_t now) {
+    return
+      1*vec[META_POP_IRD].get(now) +
+      2*vec[META_POP_IWR].get(now) +
+      1*vec[META_POP_READDIR].get(now) +
+      2*vec[META_POP_FETCH].get(now) +
+      4*vec[META_POP_STORE].get(now);
+  }
+};
+
+// Element-wise add: adjust each counter in l by the get_last() value of the
+// corresponding counter in r. Returns l.
+inline dirfrag_load_vec_t& operator+=(dirfrag_load_vec_t& l, dirfrag_load_vec_t& r)
+{
+ for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
+ l.vec[i].adjust(r.vec[i].get_last());
+ return l;
+}
+
+// Element-wise subtract: adjust each counter in l by the negated get_last()
+// value of the corresponding counter in r. Returns l.
+inline dirfrag_load_vec_t& operator-=(dirfrag_load_vec_t& l, dirfrag_load_vec_t& r)
+{
+ for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
+ l.vec[i].adjust(-r.vec[i].get_last());
+ return l;
+}
+
+// Print "[a,b]" where a and b are vec[0] and vec[1] (the META_POP_IRD and
+// META_POP_IWR slots); the remaining counters are not printed.
+inline ostream& operator<<(ostream& out, dirfrag_load_vec_t& dl)
+{
+ return out << "[" << dl.vec[0].get() << "," << dl.vec[1].get() << "]";
+}
+
+
/* meta_load_t
* hierarchical load for an inode/dir and it's children
*/
-#define META_POP_IRD 0
-#define META_POP_IWR 1
-#define META_POP_DWR 2
-//#define META_POP_LOG 3
-//#define META_POP_FDIR 4
-//#define META_POP_CDIR 4
-#define META_NPOP 3
-
+/*
class meta_load_t {
public:
DecayCounter pop[META_NPOP];
- double meta_load() {
- return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get();
+ double meta_load(utime_t now) {
+ return
+ pop[META_POP_IRD].get(now) +
+ 2*(pop[META_POP_IWR].get(now));
}
void take(meta_load_t& other) {
inline ostream& operator<<( ostream& out, meta_load_t& load )
{
- return out << "<rd " << load.pop[META_POP_IRD].get()
- << ", wr " << load.pop[META_POP_IWR].get()
- << ">";
+ return out << "<rwd "
+ << load.pop[META_POP_IRD].get() << "/"
+ << load.pop[META_POP_IWR].get()
+ << " "
+ << load.pop[META_POP_IRD].get_last_vel() << "/"
+ << load.pop[META_POP_IWR].get_last_vel()
+ << ">";
}
l.pop[i].adjust(r.pop[i].get());
return l;
}
-
+*/
/* mds_load_t
// popularity classes
#define MDS_POP_JUSTME 0 // just me (this dir or inode)
#define MDS_POP_NESTED 1 // me + children, auth or not
-#define MDS_POP_CURDOM 2 // me + children in current auth domain
-#define MDS_POP_ANYDOM 3 // me + children in any (nested) auth domain
-//#define MDS_POP_DIRMOD 4 // just this dir, modifications only
+#define MDS_POP_CURDOM 2 // (if auth) me + children in current auth domain
+#define MDS_POP_ANYDOM 3 // (if auth) me + children in any (nested) auth domain
#define MDS_NPOP 4
class mds_load_t {
public:
- meta_load_t root;
+ dirfrag_load_vec_t auth;
+ dirfrag_load_vec_t all;
double req_rate;
double cache_hit_rate;
mds_load_t() :
req_rate(0), cache_hit_rate(0), queue_len(0) { }
- double mds_load() {
+ double mds_load(utime_t now) {
switch(g_conf.mds_bal_mode) {
case 0:
- return root.meta_load()
- + req_rate
- + 10.0*queue_len;
+ return
+ .8 * auth.meta_load(now) +
+ .2 * all.meta_load(now) +
+ req_rate +
+ 10.0 * queue_len;
case 1:
return req_rate + 10.0*queue_len;
inline ostream& operator<<( ostream& out, mds_load_t& load )
{
- return out << "mdsload<" << load.root
+ return out << "mdsload<" << load.auth << "/" << load.all
<< ", req " << load.req_rate
<< ", hr " << load.cache_hit_rate
<< ", qlen " << load.queue_len
const static int PIN_REQUEST = -1003;
const static int PIN_WAITER = 1004;
const static int PIN_DIRTYSCATTERED = 1005;
-
+ static const int PIN_AUTHPIN = 1006;
+
const char *generic_pin_name(int p) {
switch (p) {
case PIN_REPLICATED: return "replicated";
case PIN_REQUEST: return "request";
case PIN_WAITER: return "waiter";
case PIN_DIRTYSCATTERED: return "dirtyscattered";
- default: assert(0);
+ case PIN_AUTHPIN: return "authpin";
+ default: assert(0); return 0;
}
}
*/
class InodeStat {
-
public:
inode_t inode;
string symlink; // symlink content (if symlink)
fragtree_t dirfragtree;
+ uint32_t mask;
// mds distribution hints
map<frag_t,int> dirfrag_auth;
public:
InodeStat() {}
InodeStat(CInode *in, int whoami) :
- inode(in->inode)
+ inode(in->inode),
+ mask(STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE)
{
- // inode.mask
- inode.mask = INODE_MASK_BASE;
- if (in->authlock.can_rdlock(0)) inode.mask |= INODE_MASK_AUTH;
- if (in->linklock.can_rdlock(0)) inode.mask |= INODE_MASK_LINK;
- if (in->filelock.can_rdlock(0)) inode.mask |= INODE_MASK_FILE;
+ // mask
+ if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH;
+ if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK;
+ if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE;
// symlink content?
if (in->is_symlink())
}
void _encode(bufferlist &bl) {
+ ::_encode(mask, bl);
::_encode(inode, bl);
::_encode(dirfrag_auth, bl);
::_encode(dirfrag_dist, bl);
}
void _decode(bufferlist &bl, int& off) {
+ ::_decode(mask, bl, off);
::_decode(inode, bl, off);
::_decode(dirfrag_auth, bl, off);
::_decode(dirfrag_dist, bl, off);
list<InodeStat*> trace_in;
list<string> trace_dn;
+ list<string> dir_dn;
list<InodeStat*> dir_in;
- list<string> dir_dn;
public:
long get_tid() { return st.tid; }
trace_in.push_back(ci);
}
+ // dir contents
+ ::_decode(dir_dn, payload, off);
for (int i=0; i<st._dir_size; ++i) {
InodeStat *ci = new InodeStat;
ci->_decode(payload, off);
dir_in.push_back(ci);
- string dn;
- ::_decode(dn, payload, off);
- dir_dn.push_back(dn);
}
}
virtual void encode_payload() {
}
// dir contents
- pdn = dir_dn.begin();
+ ::_encode(dir_dn, payload);
for (pin = dir_in.begin();
pin != dir_in.end();
- ++pin, ++pdn) {
+ ++pin)
(*pin)->_encode(payload);
- ::_encode(*pdn, payload);
- }
}
// builders
dir_in.push_back(in);
++st._dir_size;
}*/
- void take_dir_items(list<InodeStat*>& inls,
- list<string>& dnls,
- int num) {
- dir_in.swap(inls);
+ void take_dir_items(list<string>& dnls,
+ list<InodeStat*>& inls,
+ int num) {
dir_dn.swap(dnls);
+ dir_in.swap(inls);
st._dir_size = num;
}
+ /*
void copy_dir_items(const list<InodeStat*>& inls,
const list<string>& dnls) {
list<string>::const_iterator pdn = dnls.begin();
++st._dir_size;
}
}
+ */
void set_trace_dist(CInode *in, int whoami) {
st._num_trace_in = 0;
} chown;
struct {
mode_t mode;
+ dev_t rdev;
} mknod;
struct {
mode_t mode;
default:
assert(0);
+ return false;
}
}
case OP_OPEN: return "open";
case OP_REQUEST_CLOSE: return "request_close";
case OP_CLOSE: return "close";
- default: assert(0);
+ default: assert(0); return 0;
}
}
list<dirfrag_t> bounds;
list<CInodeDiscover*> inodes;
+ list<CDentryDiscover*> dentries;
map<inodeno_t,dirfrag_t> inode_dirfrag;
map<inodeno_t,string> inode_dentry;
dirfrag_t get_dirfrag() { return dirfrag; }
list<dirfrag_t>& get_bounds() { return bounds; }
list<CInodeDiscover*>& get_inodes() { return inodes; }
+ list<CDentryDiscover*>& get_dentries() { return dentries; }
list<frag_t>& get_inode_dirfrags(inodeno_t ino) {
return frags_by_ino[ino];
}
iit != inodes.end();
iit++)
delete *iit;
+ for (list<CDentryDiscover*>::iterator p = dentries.begin();
+ p != dentries.end();
+ p++)
+ delete *p;
for (map<dirfrag_t,CDirDiscover*>::iterator dit = dirfrags.begin();
dit != dirfrags.end();
dit++)
void add_export(dirfrag_t df) {
bounds.push_back( df );
}
- void add_inode(dirfrag_t df, const string& dentry, CInodeDiscover *in) {
+ void add_inode(dirfrag_t df, const string& name, CDentryDiscover *dn, CInodeDiscover *in) {
inodes.push_back(in);
+ dentries.push_back(dn);
inode_dirfrag[in->get_ino()] = df;
- inode_dentry[in->get_ino()] = dentry;
+ inode_dentry[in->get_ino()] = name;
}
void add_dirfrag(CDirDiscover *dir) {
dirfrags[dir->get_dirfrag()] = dir;
CInodeDiscover *in = new CInodeDiscover;
in->_decode(payload, off);
inodes.push_back(in);
+
+ // dentry
+ CDentryDiscover *dn = new CDentryDiscover;
+ dn->_decode(payload, off);
+ dentries.push_back(dn);
// dentry
string d;
// inodes
int ni = inodes.size();
payload.append((char*)&ni, sizeof(int));
- for (list<CInodeDiscover*>::iterator iit = inodes.begin();
- iit != inodes.end();
- iit++) {
+ list<CDentryDiscover*>::iterator dit = dentries.begin();
+ list<CInodeDiscover*>::iterator iit = inodes.begin();
+ while (iit != inodes.end()) {
(*iit)->_encode(payload);
-
- // dentry
+ (*dit)->_encode(payload);
+
+ // dentry name
_encode(inode_dentry[(*iit)->get_ino()], payload);
// dir ino
// child frags
::_encode(frags_by_ino[(*iit)->get_ino()], payload);
+
+ iit++;
+ dit++;
}
// dirs
case LOCK_AC_MIXEDACK: return "mixedack";
case LOCK_AC_LOCKACK: return "lockack";
case LOCK_AC_REQSCATTER: return "reqscatter";
- default: assert(0);
+ default: assert(0); return 0;
}
}
class MMDSBeacon : public Message {
entity_inst_t inst;
+ epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree
int state;
version_t seq;
public:
MMDSBeacon() : Message(MSG_MDS_BEACON) {}
- MMDSBeacon(entity_inst_t i, int st, version_t se) :
+ MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se) :
Message(MSG_MDS_BEACON),
- inst(i), state(st), seq(se) { }
+ inst(i), last_epoch_seen(les), state(st), seq(se) { }
entity_inst_t& get_mds_inst() { return inst; }
+ epoch_t get_last_epoch_seen() { return last_epoch_seen; }
int get_state() { return state; }
version_t get_seq() { return seq; }
char *get_type_name() { return "mdsbeacon"; }
void encode_payload() {
::_encode(inst, payload);
+ ::_encode(last_epoch_seen, payload);
::_encode(state, payload);
::_encode(seq, payload);
}
void decode_payload() {
int off = 0;
::_decode(inst, payload, off);
+ ::_decode(last_epoch_seen, payload, off);
::_decode(state, payload, off);
::_decode(seq, payload, off);
}
case OP_ACK: return "ack";
case OP_MISSING: return "missing";
case OP_FULL: return "full";
- default: assert(0);
+ default: assert(0); return 0;
}
}
struct dn_strong {
inodeno_t ino;
inodeno_t remote_ino;
+ unsigned char remote_d_type;
int32_t nonce;
int32_t lock;
- dn_strong() : ino(0), remote_ino(0), nonce(0), lock(0) {}
- dn_strong(inodeno_t pi, inodeno_t ri, int n, int l) :
- ino(pi), remote_ino(ri), nonce(n), lock(l) {}
+ dn_strong() :
+ ino(0), remote_ino(0), remote_d_type(0), nonce(0), lock(0) {}
+ dn_strong(inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int l) :
+ ino(pi), remote_ino(ri), remote_d_type(rdt), nonce(n), lock(l) {}
bool is_primary() { return ino > 0; }
bool is_remote() { return remote_ino > 0; }
bool is_null() { return ino == 0 && remote_ino == 0; }
struct dn_weak {
inodeno_t ino;
inodeno_t remote_ino;
- dn_weak() : ino(0), remote_ino(0) {}
- dn_weak(inodeno_t pi, inodeno_t ri) : ino(pi), remote_ino(ri) {}
+ unsigned char remote_d_type;
+ dn_weak() :
+ ino(0), remote_ino(0), remote_d_type(0) {}
+ dn_weak(inodeno_t pi, inodeno_t ri, unsigned char rdt) :
+ ino(pi), remote_ino(ri), remote_d_type(rdt) {}
bool is_primary() { return ino > 0; }
bool is_remote() { return remote_ino > 0; }
bool is_null() { return ino == 0 && remote_ino == 0; }
weak[df][dname] = dnw;
}
void add_weak_null_dentry(dirfrag_t df, const string& dname) {
- weak[df][dname] = dn_weak(0, 0);
+ weak[df][dname] = dn_weak(0, 0, 0);
}
void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) {
- weak[df][dname] = dn_weak(ino, 0);
+ weak[df][dname] = dn_weak(ino, 0, 0);
}
- void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino) {
- weak[df][dname] = dn_weak(0, ino);
+ void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino, unsigned char rdt) {
+ weak[df][dname] = dn_weak(0, ino, rdt);
}
- void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, int n, int ls) {
- strong_dentries[df][dname] = dn_strong(pi, ri, n, ls);
+ void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) {
+ strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls);
}
void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) {
authpinned_dentries[df][dname] = ri;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSFRAGMENTNOTIFY_H
+#define __MMDSFRAGMENTNOTIFY_H
+
+#include "msg/Message.h"
+#include <string>
+using namespace std;
+
+// Message of type MSG_MDS_FRAGMENTNOTIFY carrying an inode number, a base
+// frag, a bit count, and an opaque bufferlist (basebl).
+class MMDSFragmentNotify : public Message {
+  inodeno_t ino;
+  frag_t basefrag;
+  int8_t bits;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  frag_t get_basefrag() { return basefrag; }
+  int get_bits() { return bits; }
+
+  bufferlist basebl;
+
+  // Default constructor: tag the message type and zero the scalar fields so
+  // an instance that has not been decoded yet never holds indeterminate
+  // values (same convention as the other messages' default constructors).
+  MMDSFragmentNotify() :
+    Message(MSG_MDS_FRAGMENTNOTIFY),
+    ino(0), bits(0) { }
+  MMDSFragmentNotify(inodeno_t i, frag_t bf, int b) :
+    Message(MSG_MDS_FRAGMENTNOTIFY),
+    ino(i), basefrag(bf), bits(b) { }
+
+  virtual char *get_type_name() { return "fragment_notify"; }
+  // Print as "fragment_notify(<ino>#<basefrag> <bits>)".
+  void print(ostream& o) {
+    o << "fragment_notify(" << ino << "#" << basefrag
+      << " " << (int)bits << ")";
+  }
+
+  // Decode fields in the same order encode_payload() writes them.
+  virtual void decode_payload() {
+    int off = 0;
+    ::_decode(ino, payload, off);
+    ::_decode(basefrag, payload, off);
+    ::_decode(bits, payload, off);
+    ::_decode(basebl, payload, off);
+  }
+  // Append ino, basefrag, bits and basebl to the payload, in that order.
+  virtual void encode_payload() {
+    ::_encode(ino, payload);
+    ::_encode(basefrag, payload);
+    ::_encode(bits, payload);
+    ::_encode(basebl, payload);
+  }
+};
+
+#endif
eversion_t pg_trim_to; // primary->replica: trim to here
- int op;
- size_t length;
- off_t offset;
+ int32_t op;
+ off_t offset, length;
eversion_t version;
eversion_t old_version;
return st.op < 10;
}
- const size_t get_length() { return st.length; }
+ const off_t get_length() { return st.length; }
const off_t get_offset() { return st.offset; }
map<string,bufferptr>& get_attrset() { return attrset; }
bufferlist& get_data() {
return data;
}
- size_t get_data_len() { return data.length(); }
+ off_t get_data_len() { return data.length(); }
MOSDOp(entity_inst_t asker, int inc, long tid,
void set_layout(const ObjectLayout& l) { st.layout = l; }
- void set_length(size_t l) { st.length = l; }
+ void set_length(off_t l) { st.length = l; }
void set_offset(off_t o) { st.offset = o; }
void set_version(eversion_t v) { st.version = v; }
void set_old_version(eversion_t ov) { st.old_version = ov; }
out << "osd_op(" << st.reqid
<< " " << get_opname(st.op)
<< " " << st.oid;
+ if (st.length) out << " " << st.offset << "~" << st.length;
if (st.retry_attempt) out << " RETRY";
out << ")";
}
object_t oid;
ObjectLayout layout; // pgid, etc.
- int op;
+ int32_t op;
// reply
- int result;
+ int32_t result;
bool commit;
- size_t length, offset;
- size_t object_size;
+ off_t length, offset;
+ off_t object_size;
eversion_t version;
eversion_t pg_complete_thru;
bool get_commit() { return st.commit; }
int get_result() { return st.result; }
- size_t get_length() { return st.length; }
- size_t get_offset() { return st.offset; }
- size_t get_object_size() { return st.object_size; }
+ off_t get_length() { return st.length; }
+ off_t get_offset() { return st.offset; }
+ off_t get_object_size() { return st.object_size; }
eversion_t get_version() { return st.version; }
map<string,bufferptr>& get_attrset() { return attrset; }
void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; }
void set_result(int r) { st.result = r; }
- void set_length(size_t s) { st.length = s; }
- void set_offset(size_t o) { st.offset = o; }
- void set_object_size(size_t s) { st.object_size = s; }
+ void set_length(off_t s) { st.length = s; }
+ void set_offset(off_t o) { st.offset = o; }
+ void set_object_size(off_t s) { st.object_size = s; }
void set_version(eversion_t v) { st.version = v; }
void set_attrset(map<string,bufferptr> &as) { attrset = as; }
out << "osd_op_reply(" << st.reqid
<< " " << MOSDOp::get_opname(st.op)
<< " " << st.oid;
+ if (st.length) out << " " << st.offset << "~" << st.length;
if (st.commit)
out << " commit";
else
}
char *get_type_name() { return "PGlog"; }
+ void print(ostream& out) {
+ out << "pg_log(" << pgid << " e" << epoch << ")";
+ }
void encode_payload() {
payload.append((char*)&epoch, sizeof(epoch));
}
char *get_type_name() { return "PGsum"; }
+ void print(ostream& out) {
+ out << "pg_summary(" << pgid << " e" << epoch << ")";
+ }
void encode_payload() {
payload.append((char*)&epoch, sizeof(epoch));
}
char *get_type_name() { return "PGUp"; }
+ void print(ostream& out) {
+ out << "pg_update(" << pgid << " e" << map_version;
+ if (complete) out << " complete";
+ out << " lac=" << last_any_complete;
+ out << ")";
+ }
void encode_payload() {
payload.append((char*)&map_version, sizeof(map_version));
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MPGSTATS_H
+#define __MPGSTATS_H
+
+#include "osd/osd_types.h"
+
+// Message of type MSG_PGSTATS carrying a map of per-pg stat records.
+class MPGStats : public Message {
+public:
+  map<pg_t,pg_stat_t> pg_stat;
+
+  MPGStats() : Message(MSG_PGSTATS) {}
+
+  char *get_type_name() { return "pg_stats"; }
+  // Print a short tag only. No endl: messages are streamed inline into
+  // log lines (e.g. "... " << *m << " from ..."), so a newline here would
+  // split those lines in two.
+  void print(ostream& out) {
+    out << "pg_stats";
+  }
+
+  // Payload is just the encoded pg_stat map.
+  void encode_payload() {
+    ::_encode(pg_stat, payload);
+  }
+  void decode_payload() {
+    int off = 0;
+    ::_decode(pg_stat, payload, off);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MSTATFS_H
+#define __MSTATFS_H
+
+#include <sys/statvfs.h> /* or <sys/statfs.h> */
+
+// Message of type MSG_STATFS carrying a struct statvfs.
+class MStatfs : public Message {
+public:
+  struct statvfs stfs;
+
+  MStatfs() : Message(MSG_STATFS) {}
+
+  char *get_type_name() { return "statfs"; }
+  // Print a short tag only. No endl: messages are streamed inline into
+  // log lines (e.g. "handle_statfs " << *statfs << " from ..."), so a
+  // newline here would split those lines in two.
+  void print(ostream& out) {
+    out << "statfs";
+  }
+
+  // Payload is just the encoded stfs struct.
+  void encode_payload() {
+    ::_encode(stfs, payload);
+  }
+  void decode_payload() {
+    int off = 0;
+    ::_decode(stfs, payload, off);
+  }
+};
+
+#endif
dout(10) << "encode_pending v " << pending_inc.version
<< ", next is " << pending_inc.next_client
<< endl;
-
assert(paxos->get_version() + 1 == pending_inc.version);
pending_inc._encode(bl);
}
// reply to beacon?
if (state != MDSMap::STATE_STOPPED) {
last_beacon[from] = g_clock.now(); // note time
- mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), state, seq),
+ mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq),
m->get_mds_inst());
}
// is there a state change here?
- if (mdsmap.mds_state.count(from) == 0 ||
- mdsmap.mds_state[from] != state)
- return false; // yep, need to update map.
+ if (mdsmap.mds_state.count(from) == 0) {
+ if (state == MDSMap::STATE_BOOT)
+ return false; // need to add to map
+ dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << endl;
+ } else if (mdsmap.mds_state[from] != state) {
+ if (mdsmap.get_epoch() == m->get_last_epoch_seen())
+ return false; // need to update map
+ dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << endl;
+ }
// we're done.
delete m;
// assign a name.
if (from >= 0) {
// wants to be (or already is) a specific MDS.
- if (mdsmap.is_failed(from)) {
- dout(10) << "mds_beacon boot: mds" << from << " was failed, replaying" << endl;
- state = MDSMap::STATE_REPLAY;
- } else if (mdsmap.is_stopped(from)) {
- dout(10) << "mds_beacon boot: mds" << from << " was stopped, starting" << endl;
- state = MDSMap::STATE_STARTING;
- } else if (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst()) {
+ if (!g_conf.mon_allow_mds_bully &&
+ (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) {
dout(10) << "mds_beacon boot: mds" << from << " is someone else" << endl;
from = -1;
+ } else {
+ switch (mdsmap.get_state(from)) {
+ case MDSMap::STATE_STOPPED:
+ case MDSMap::STATE_STARTING:
+ case MDSMap::STATE_STANDBY:
+ state = MDSMap::STATE_STARTING;
+ break;
+ case MDSMap::STATE_DNE:
+ case MDSMap::STATE_CREATING:
+ state = MDSMap::STATE_CREATING;
+ break;
+ case MDSMap::STATE_FAILED:
+ default:
+ state = MDSMap::STATE_REPLAY;
+ break;
+ }
+ dout(10) << "mds_beacon boot: mds" << from
+ << " was " << MDSMap::get_state_name(mdsmap.get_state(from))
+ << ", " << MDSMap::get_state_name(state)
+ << endl;
}
}
if (from < 0) {
print_map(mdsmap);
for (map<int,int>::iterator p = mdsmap.mds_state.begin();
p != mdsmap.mds_state.end();
- ++p)
- if (mdsmap.is_active(p->first))
+ ++p) {
+ switch (p->second) {
+ case MDSMap::STATE_ACTIVE:
+ case MDSMap::STATE_STOPPING:
pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING;
+ break;
+ case MDSMap::STATE_CREATING:
+ case MDSMap::STATE_STANDBY:
+ pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE;
+ break;
+ case MDSMap::STATE_STARTING:
+ pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPED;
+ break;
+ case MDSMap::STATE_REPLAY:
+ case MDSMap::STATE_RESOLVE:
+ case MDSMap::STATE_RECONNECT:
+ case MDSMap::STATE_REJOIN:
+ // BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they?
+ pending_mdsmap.mds_state[p->first] = MDSMap::STATE_FAILED;
+ break;
+ }
+ }
propose_pending();
}
string old = dir;
char *cwd = get_current_dir_name();
dir = cwd;
- delete cwd;
+ free(cwd);
dir += "/";
dir += old;
}
#include "osd/osd_types.h"
class PGMap {
-
public:
+  // the map
+  version_t version;                 // advanced by one per applied incremental
+  hash_map<pg_t,pg_stat_t> pg_stat;  // latest stat record per pg
+
class Incremental {
+  public:
+    version_t version;                    // must equal PGMap::version+1 when applied
+    map<pg_t,pg_stat_t> pg_stat_updates;  // new or replacement stat records
+
+    // Start from a defined version rather than an indeterminate value.
+    Incremental() : version(0) {}
+
+    void _encode(bufferlist &bl) {
+      ::_encode(version, bl);
+      ::_encode(pg_stat_updates, bl);
+    }
+    void _decode(bufferlist& bl, int& off) {
+      ::_decode(version, bl, off);
+      ::_decode(pg_stat_updates, bl, off);
+    }
};
+  // Apply the next incremental: bump version and replace each listed pg's
+  // stat record, keeping the aggregate stats in step (subtract the old
+  // record, if any, then add the new one).
+  void apply_incremental(Incremental& inc) {
+    assert(inc.version == version+1);
+    version++;
+    for (map<pg_t,pg_stat_t>::iterator p = inc.pg_stat_updates.begin();
+         p != inc.pg_stat_updates.end();
+         ++p) {
+      if (pg_stat.count(p->first))
+        stat_sub(pg_stat[p->first]);
+      pg_stat[p->first] = p->second;
+      stat_add(p->second);
+    }
+  }
+
+  // aggregate stats (soft state)
+  hash_map<int,int> num_pg_by_state;
+  int64_t num_pg;
+  int64_t total_size;
+  int64_t total_num_blocks;
+
+  // Clear all aggregate stats.
+  void stat_zero() {
+    num_pg = 0;
+    num_pg_by_state.clear();
+    total_size = 0;
+    total_num_blocks = 0;
+  }
+  // Account one pg's stat record in the aggregates.
+  void stat_add(pg_stat_t &s) {
+    num_pg++;
+    num_pg_by_state[s.state]++;
+    total_size += s.size;
+    total_num_blocks += s.num_blocks;
+  }
+  // Remove one pg's stat record from the aggregates (inverse of stat_add).
+  void stat_sub(pg_stat_t &s) {
+    num_pg--;
+    num_pg_by_state[s.state]--;
+    total_size -= s.size;
+    total_num_blocks -= s.num_blocks;
+  }
+
+  PGMap() : version(0),
+	    num_pg(0), total_size(0), total_num_blocks(0) {}
+
+  // Only version and pg_stat are encoded; the aggregates are recomputed
+  // on decode.
+  void _encode(bufferlist &bl) {
+    ::_encode(version, bl);
+    ::_encode(pg_stat, bl);
+  }
+  void _decode(bufferlist& bl, int& off) {
+    ::_decode(version, bl, off);
+    ::_decode(pg_stat, bl, off);
+    stat_zero();
+    for (hash_map<pg_t,pg_stat_t>::iterator p = pg_stat.begin();
+         p != pg_stat.end();
+         ++p)
+      stat_add(p->second);
+  }
};
#endif
#include "OSDMonitor.h"
#include "MonitorStore.h"
+#include "messages/MPGStats.h"
+#include "messages/MStatfs.h"
+
#include "common/Timer.h"
#include "config.h"
+// Currently only logs; no initial pg map state is set up here.
void PGMonitor::create_initial()
{
+ dout(1) << "create_initial -- creating initial map" << endl;
}
+// Bring pg_map up to the version paxos reports.
+// Returns true if pg_map already matches or was caught up (in which case the
+// full map is also stored under "pgmap"/"latest"); returns false if an
+// incremental could not be read.
bool PGMonitor::update_from_paxos()
{
+ version_t paxosv = paxos->get_version();
+ if (paxosv == pg_map.version) return true;
+ assert(paxosv >= pg_map.version);
+
+ // a fresh (version 0) map can start from the stored full copy instead of
+ // replaying every incremental
+ if (pg_map.version == 0 && paxosv > 1 &&
+ mon->store->exists_bl_ss("pgmap","latest")) {
+ // starting up: load latest
+ dout(7) << "update_from_paxos startup: loading latest full pgmap" << endl;
+ bufferlist bl;
+ mon->store->get_bl_ss(bl, "pgmap", "latest");
+ int off = 0;
+ pg_map._decode(bl, off);
+ }
+
+ // walk through incrementals
+ while (paxosv > pg_map.version) {
+ bufferlist bl;
+ bool success = paxos->read(pg_map.version+1, bl);
+ if (success) {
+ dout(7) << "update_from_paxos applying incremental " << pg_map.version+1 << endl;
+ PGMap::Incremental inc;
+ int off = 0;
+ inc._decode(bl, off);
+ pg_map.apply_incremental(inc);
+
+ } else {
+ dout(7) << "update_from_paxos couldn't read incremental " << pg_map.version+1 << endl;
+ return false;
+ }
+ }
+
+ // save latest
+ bufferlist bl;
+ pg_map._encode(bl);
+ mon->store->put_bl_ss(bl, "pgmap", "latest");
+
return true;
}
+// Start a fresh pending incremental whose version is one past the current
+// pg_map version.
void PGMonitor::create_pending()
{
-
+ pending_inc = PGMap::Incremental();
+ pending_inc.version = pg_map.version + 1;
+ dout(10) << "create_pending v " << pending_inc.version << endl;
}
+// Encode the pending incremental into bl. Asserts that this monitor is the
+// leader and that the pending version is exactly one past the paxos version.
void PGMonitor::encode_pending(bufferlist &bl)
{
-
+ assert(mon->is_leader());
+ dout(10) << "encode_pending v " << pending_inc.version << endl;
+ assert(paxos->get_version() + 1 == pending_inc.version);
+ pending_inc._encode(bl);
}
bool PGMonitor::preprocess_query(Message *m)
{
- return true;
+ dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << endl;
+
+ switch (m->get_type()) {
+ case MSG_STATFS:
+ handle_statfs((MStatfs*)m);
+ return true;
+
+ case MSG_PGSTATS:
+ {
+ MPGStats *stats = (MPGStats*)m;
+ for (map<pg_t,pg_stat_t>::iterator p = stats->pg_stat.begin();
+ p != stats->pg_stat.end();
+ p++) {
+ if (pg_map.pg_stat.count(p->first) == 0 ||
+ pg_map.pg_stat[p->first].reported < p->second.reported)
+ return false;
+ }
+ dout(10) << " message contains no new pg stats" << endl;
+ return true;
+ }
+
+ default:
+ assert(0);
+ delete m;
+ return true;
+ }
}
+// Dispatch an update-carrying message. Only MSG_PGSTATS is expected; it is
+// forwarded to handle_pg_stats() and that result is returned. Any other type
+// trips the assert (and, if asserts are compiled out, is deleted and false
+// is returned).
bool PGMonitor::prepare_update(Message *m)
{
+ dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << endl;
+ switch (m->get_type()) {
+ case MSG_PGSTATS:
+ return handle_pg_stats((MPGStats*)m);
+
+ default:
+ assert(0);
+ delete m;
+ return false;
+ }
+}
+
+
+// Answer a statfs request: zero the message's stfs struct, fill in the
+// fields we have values for, and send the same message object back to its
+// source as the reply.
+void PGMonitor::handle_statfs(MStatfs *statfs)
+{
+ dout(10) << "handle_statfs " << *statfs << " from " << statfs->get_source() << endl;
+
+ // fill out stfs
+ memset(&statfs->stfs, 0, sizeof(statfs->stfs));
+ // only f_blocks, f_fsid and f_flag are set; every other field stays zero
+ statfs->stfs.f_blocks = pg_map.total_num_blocks;
+ statfs->stfs.f_fsid = 0; // hmm.
+ statfs->stfs.f_flag = ST_NOATIME|ST_NODIRATIME; // for now.
+
+ // reply
+ mon->messenger->send_message(statfs, statfs->get_source_inst());
+}
+
+// Fold a pg stats report into the pending incremental and the live map.
+// A record is skipped when the live map or the pending incremental already
+// holds one for that pg whose `reported` value is at least as new.
+// Deletes the message and always returns true.
+bool PGMonitor::handle_pg_stats(MPGStats *stats)
+{
+  dout(10) << "handle_pg_stats " << *stats << " from " << stats->get_source() << endl;
+
+  for (map<pg_t,pg_stat_t>::iterator p = stats->pg_stat.begin();
+       p != stats->pg_stat.end();
+       p++) {
+    // key of the record being processed (was left default-constructed, which
+    // filed every record under the same pg id)
+    pg_t pgid = p->first;
+    if ((pg_map.pg_stat.count(pgid) &&
+	 pg_map.pg_stat[pgid].reported >= p->second.reported)) {
+      dout(15) << " had " << pgid << " from " << pg_map.pg_stat[pgid].reported << endl;
+      continue;
+    }
+    if (pending_inc.pg_stat_updates.count(pgid) &&
+	pending_inc.pg_stat_updates[pgid].reported >= p->second.reported) {
+      dout(15) << " had " << pgid << " from " << pending_inc.pg_stat_updates[pgid].reported
+	       << " (pending)" << endl;
+      continue;
+    }
+
+    dout(15) << " got " << pgid << " reported at " << p->second.reported << endl;
+    pending_inc.pg_stat_updates[pgid] = p->second;
+
+    // we don't care about consistency; apply to live map.
+    if (pg_map.pg_stat.count(pgid))
+      pg_map.stat_sub(pg_map.pg_stat[pgid]);
+    pg_map.pg_stat[pgid] = p->second;
+    pg_map.stat_add(pg_map.pg_stat[pgid]);
+  }
+
+  delete stats;
return true;
}
#include "PGMap.h"
+class MPGStats;
+class MStatfs;
+
class PGMonitor : public PaxosService {
public:
-
private:
PGMap pg_map;
PGMap::Incremental pending_inc;
bool preprocess_query(Message *m); // true if processed.
bool prepare_update(Message *m);
-
+ void handle_statfs(MStatfs *statfs);
+ bool handle_pg_stats(MPGStats *stats);
+
public:
PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { }
if (fm_shutdown) break;
fakemessenger_do_loop_2();
- if (directory.empty()) break;
+ if (directory.empty() && nranks > 0) break;
dout(20) << "thread waiting" << endl;
if (fm_shutdown) break;
{
//lock.Lock();
dout(18) << "do_loop begin." << endl;
-
+
while (1) {
bool didone = false;
_myinst.name = me;
_myinst.addr.port = nranks++;
//if (!me.is_mon())
- //_myinst.addr.nonce = getpid();
+ _myinst.addr.nonce = getpid();
// add to directory
directory[ _myinst.addr ] = this;
m->set_source(get_myname(), fromport);
m->set_source_addr(get_myaddr());
- m->set_dest(inst.name, port);
+ m->set_dest_inst(inst);
+ m->set_dest_port(port);
lock.Lock();
// queue
if (directory.count(inst.addr) &&
shutdown_set.count(inst.addr) == 0) {
- dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << endl;
+ dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << " -- " << m
+ << endl;
directory[inst.addr]->queue_incoming(m);
} else {
dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m
- << " *** destination DNE ***"
+ << " *** destination " << inst.addr << " DNE ***"
<< endl;
for (map<entity_addr_t, FakeMessenger*>::iterator p = directory.begin();
p != directory.end();
#include "messages/MGenericMessage.h"
+#include "messages/MPGStats.h"
+#include "messages/MStatfs.h"
+
#include "messages/MMonCommand.h"
#include "messages/MMonCommandAck.h"
#include "messages/MMonPaxos.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
+#include "messages/MMDSFragmentNotify.h"
+
#include "messages/MExportDirDiscover.h"
#include "messages/MExportDirDiscoverAck.h"
#include "messages/MExportDirCancel.h"
// -- with payload --
+ case MSG_PGSTATS:
+ m = new MPGStats;
+ break;
+ case MSG_STATFS:
+ m = new MStatfs;
+ break;
+
case MSG_MON_COMMAND:
m = new MMonCommand;
break;
m = new MDiscoverReply();
break;
+ case MSG_MDS_FRAGMENTNOTIFY:
+ m = new MMDSFragmentNotify;
+ break;
+
case MSG_MDS_EXPORTDIRDISCOVER:
m = new MExportDirDiscover();
break;
#define MSG_CLOSE 0
+#define MSG_STATFS 1
+#define MSG_PGSTATS 2
+
#define MSG_PING 10
#define MSG_PING_ACK 11
#define MSG_MDS_ANCHOR 130
+#define MSG_MDS_FRAGMENTNOTIFY 140
+
#define MSG_MDS_EXPORTDIRDISCOVER 149
#define MSG_MDS_EXPORTDIRDISCOVERACK 150
#define MSG_MDS_EXPORTDIRCANCEL 151
typedef struct {
- int type;
+ int32_t type;
entity_inst_t src, dst;
- int source_port, dest_port;
- int nchunks;
+ int32_t source_port, dest_port;
+ int32_t nchunks;
} msg_envelope_t;
#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t)
// payload
bufferlist blist;
for (int i=0; i<env.nchunks; i++) {
- int size;
+ int32_t size;
if (!tcp_read( sd, (char*)&size, sizeof(size) )) {
need_to_send_close = false;
return 0;
it != blist.buffers().end();
it++) {
dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
- int size = (*it).length();
+ int32_t size = (*it).length();
r = tcp_write( sd, (char*)&size, sizeof(size) );
if (r < 0) {
derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
}
#else
// one big chunk
- int size = blist.length();
+ int32_t size = blist.length();
r = tcp_write( sd, (char*)&size, sizeof(size) );
if (r < 0) {
derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
// new typed msg_addr_t way!
class entity_name_t {
- int _type;
- int _num;
+ int32_t _type;
+ int32_t _num;
public:
static const int TYPE_MON = 1;
* tcp crap
*/
+/*
+inlined, see tcp.h
+
+
bool tcp_read(int sd, char *buf, int len)
{
while (len > 0) {
}
return 0;
}
-
+*/
int tcp_hostlookup(char *str, tcpaddr_t& ta)
{
return out;
}
-extern bool tcp_read(int sd, char *buf, int len);
-extern int tcp_write(int sd, char *buf, int len);
+// Read exactly len bytes from socket sd into buf.
+// Returns true once the requested amount has been received (immediately if
+// len <= 0); returns false as soon as a recv() call returns <= 0.
+inline bool tcp_read(int sd, char *buf, int len) {
+  char *dst = buf;
+  int remaining = len;
+  for (;;) {
+    if (remaining <= 0)
+      return true;
+    const int n = ::recv( sd, dst, remaining, 0 );
+    if (n <= 0)
+      return false;     // recv reported end-of-stream or an error
+    dst += n;
+    remaining -= n;
+  }
+}
+
+// Write all len bytes of buf to socket sd, looping over partial sends.
+// len must be > 0 (asserted).  Returns 0 when everything was sent, or the
+// negative value returned by the failing send() call.
+inline int tcp_write(int sd, char *buf, int len) {
+  assert(len > 0);
+  const char *src = buf;
+  for (int left = len; left > 0; ) {
+    const int sent = ::send( sd, src, left, 0 );
+    if (sent < 0)
+      return sent;      // pass the error straight back to the caller
+    src += sent;
+    left -= sent;
+  }
+  return 0;
+}
+
+
extern int tcp_hostlookup(char *str, tcpaddr_t& ta);
inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) {
if (started) cerr << "newsyn finishing" << endl;
- return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?).
+ // cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
+ char s[20];
+ sprintf(s, "gmon/%d", myrank);
+ mkdir(s, 0755);
+ chdir(s);
+
+
+ return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?).
// cleanup
for (map<int,MDS*>::iterator i = mds.begin(); i != mds.end(); i++)
}
*/
+
+
return 0;
}
const bufferlist& bl,
Context *onsafe)
{
- dout(20) << "write " << oid << " len " << len << " off " << offset << endl;
-
char fn[200];
get_oname(oid,fn);
+
+ dout(20) << "write " << fn << " len " << len << " off " << offset << endl;
+
::mknod(fn, 0644, 0); // in case it doesn't exist yet.
void FakeStore::sync(Context *onsafe)
{
- if (g_conf.fakestore_fake_sync) {
+ if (g_conf.fakestore_fake_sync > 0.0) {
g_timer.add_event_after((float)g_conf.fakestore_fake_sync,
new C_FakeSync(onsafe, &unsync, &synclock, &synccond));
struct dirent *de;
while ((de = ::readdir(dir)) != 0) {
// parse
+ errno = 0;
coll_t c = strtoll(de->d_name, 0, 16);
- dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl;
- if (errno) continue;
- ls.push_back(c);
+ if (c) ls.push_back(c);
}
::closedir(dir);
char *osd_base_path = "./osddata";
char *ebofs_base_path = "./dev";
-object_t SUPERBLOCK_OBJECT(0,0);
+static const object_t SUPERBLOCK_OBJECT(0,0);
// <hack> force remount hack for performance testing FakeStore
peer_qlen[from] = m->avg_qlen;
peer_read_time[from] = m->read_mean_time;
- //if (!m->ack)
- //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true),
- //m->get_source());
-
delete m;
}
bufferlist fullmap; // in leiu of below.
// incremental
- map<int,entity_inst_t> new_up;
- map<int,entity_inst_t> new_down;
- list<int> new_in;
- list<int> new_out;
- map<int,float> new_overload; // updated overload value
- list<int> old_overload; // no longer overload
+ map<int32_t,entity_inst_t> new_up;
+ map<int32_t,entity_inst_t> new_down;
+ list<int32_t> new_in;
+ list<int32_t> new_out;
+ map<int32_t,float> new_overload; // updated overload value
+ list<int32_t> old_overload; // no longer overload
void encode(bufferlist& bl) {
::_encode(epoch, bl);
epoch_t epoch; // what epoch of the osd cluster descriptor is this
epoch_t mon_epoch; // monitor epoch (election iteration)
utime_t ctime; // epoch start time
- int pg_num; // placement group count
- int pg_num_mask; // bitmask for above
- int localized_pg_num; // localized place group count
- int localized_pg_num_mask; // ditto
+ int32_t pg_num; // placement group count
+ int32_t pg_num_mask; // bitmask for above
+ int32_t localized_pg_num; // localized place group count
+ int32_t localized_pg_num_mask; // ditto
- set<int> osds; // all osds
- set<int> down_osds; // list of down disks
- set<int> out_osds; // list of unmapped disks
- map<int,float> overload_osds;
- map<int,entity_inst_t> osd_inst;
+ set<int32_t> osds; // all osds
+ set<int32_t> down_osds; // list of down disks
+ set<int32_t> out_osds; // list of unmapped disks
+ map<int32_t,float> overload_osds;
+ map<int32_t,entity_inst_t> osd_inst;
public:
Crush crush; // hierarchical map
}
// nope, incremental.
- for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+ for (map<int32_t,entity_inst_t>::iterator i = inc.new_down.begin();
i != inc.new_down.end();
i++) {
assert(down_osds.count(i->first) == 0);
osd_inst.erase(i->first);
//cout << "epoch " << epoch << " down osd" << i->first << endl;
}
- for (list<int>::iterator i = inc.new_out.begin();
+ for (list<int32_t>::iterator i = inc.new_out.begin();
i != inc.new_out.end();
i++) {
assert(out_osds.count(*i) == 0);
out_osds.insert(*i);
//cout << "epoch " << epoch << " out osd" << *i << endl;
}
- for (list<int>::iterator i = inc.old_overload.begin();
+ for (list<int32_t>::iterator i = inc.old_overload.begin();
i != inc.old_overload.end();
i++) {
assert(overload_osds.count(*i));
overload_osds.erase(*i);
}
- for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+ for (map<int32_t,entity_inst_t>::iterator i = inc.new_up.begin();
i != inc.new_up.end();
i++) {
assert(down_osds.count(i->first));
osd_inst[i->first] = i->second;
//cout << "epoch " << epoch << " up osd" << i->first << endl;
}
- for (list<int>::iterator i = inc.new_in.begin();
+ for (list<int32_t>::iterator i = inc.new_in.begin();
i != inc.new_in.end();
i++) {
assert(out_osds.count(*i));
out_osds.erase(*i);
//cout << "epoch " << epoch << " in osd" << *i << endl;
}
- for (map<int,float>::iterator i = inc.new_overload.begin();
+ for (map<int32_t,float>::iterator i = inc.new_overload.begin();
i != inc.new_overload.end();
i++) {
overload_osds[i->first] = i->second;
static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval
static const int OP_COLL_RMATTR = 25; // cid, attrname
- list<int> ops;
+ private:
+ list<int8_t> ops;
list<bufferlist> bls;
list<object_t> oids;
- list<coll_t> cids;
- list<off_t> offsets;
- list<size_t> lengths;
+ list<coll_t> cids;
+ list<int64_t> lengths;
list<const char*> attrnames;
list<string> attrnames2;
- //list< pair<const void*,int> > attrvals;
- list<bufferlist> attrbls;
// for reads only (not encoded)
list<bufferlist*> pbls;
list< pair<void*,int*> > pattrvals;
list< map<string,bufferptr>* > pattrsets;
- const char *get_attrname() {
- if (attrnames.empty())
- return attrnames2.front().c_str();
- else
- return attrnames.front();
+ public:
+ bool have_op() {
+ return !ops.empty();
+ }
+ int get_num_ops() { return ops.size(); }
+ int get_op() {
+ int op = ops.front();
+ ops.pop_front();
+ return op;
+ }
+ void get_bl(bufferlist& bl) {
+ bl.claim(bls.front());
+ bls.pop_front();
+ }
+ void get_oid(object_t& oid) {
+ oid = oids.front();
+ oids.pop_front();
+ }
+ void get_cid(coll_t& cid) {
+ cid = cids.front();
+ cids.pop_front();
+ }
+ void get_length(off_t& len) {
+ len = lengths.front();
+ lengths.pop_front();
+ }
+ void get_attrname(const char * &p) {
+ p = attrnames.front();
+ attrnames.pop_front();
}
- void pop_attrname() {
- if (attrnames.empty())
- attrnames2.pop_front();
- else
- attrnames.pop_front();
+ void get_pbl(bufferlist* &pbl) {
+ pbl = pbls.front();
+ pbls.pop_front();
}
+ void get_pstat(struct stat* &pst) {
+ pst = psts.front();
+ psts.pop_front();
+ }
+ void get_pattrval(pair<void*,int*>& p) {
+ p = pattrvals.front();
+ pattrvals.pop_front();
+ }
+ void get_pattrset(map<string,bufferptr>* &ps) {
+ ps = pattrsets.front();
+ pattrsets.pop_front();
+ }
+
void read(object_t oid, off_t off, size_t len, bufferlist *pbl) {
int op = OP_READ;
ops.push_back(op);
oids.push_back(oid);
- offsets.push_back(off);
+ lengths.push_back(off);
lengths.push_back(len);
pbls.push_back(pbl);
}
int op = OP_WRITE;
ops.push_back(op);
oids.push_back(oid);
- offsets.push_back(off);
+ lengths.push_back(off);
lengths.push_back(len);
bls.push_back(bl);
}
int op = OP_TRIMCACHE;
ops.push_back(op);
oids.push_back(oid);
- offsets.push_back(off);
+ lengths.push_back(off);
lengths.push_back(len);
}
void truncate(object_t oid, off_t off) {
int op = OP_TRUNCATE;
ops.push_back(op);
oids.push_back(oid);
- offsets.push_back(off);
+ lengths.push_back(off);
}
void remove(object_t oid) {
int op = OP_REMOVE;
//attrvals.push_back(pair<const void*,int>(val,len));
bufferlist bl;
bl.append((char*)val,len);
- attrbls.push_back(bl);
+ bls.push_back(bl);
}
void setattrs(object_t oid, map<string,bufferptr>& attrset) {
int op = OP_SETATTRS;
ops.push_back(op);
cids.push_back(cid);
attrnames.push_back(name);
- //attrvals.push_back(pair<const void*,int>(val,len));
bufferlist bl;
bl.append((char*)val, len);
- attrbls.push_back(bl);
+ bls.push_back(bl);
}
void collection_rmattr(coll_t cid, const char* name) {
int op = OP_COLL_RMATTR;
::_encode(bls, bl);
::_encode(oids, bl);
::_encode(cids, bl);
- ::_encode(offsets, bl);
::_encode(lengths, bl);
::_encode(attrnames, bl);
- ::_encode(attrbls, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(ops, bl, off);
::_decode(bls, bl, off);
::_decode(oids, bl, off);
::_decode(cids, bl, off);
- ::_decode(offsets, bl, off);
::_decode(lengths, bl, off);
::_decode(attrnames2, bl, off);
- ::_decode(attrbls, bl, off);
+ for (list<string>::iterator p = attrnames2.begin();
+ p != attrnames2.end();
+ ++p)
+ attrnames.push_back((*p).c_str());
}
};
*/
virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) {
// non-atomic implementation
- for (list<int>::iterator p = t.ops.begin();
- p != t.ops.end();
- p++) {
- switch (*p) {
+ while (t.have_op()) {
+ int op = t.get_op();
+ switch (op) {
case Transaction::OP_READ:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
- bufferlist *pbl = t.pbls.front(); t.pbls.pop_front();
+ object_t oid;
+ off_t offset, len;
+ t.get_oid(oid);
+ t.get_length(offset);
+ t.get_length(len);
+ bufferlist *pbl;
+ t.get_pbl(pbl);
read(oid, offset, len, *pbl);
}
break;
case Transaction::OP_STAT:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- struct stat *st = t.psts.front(); t.psts.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ struct stat *st;
+ t.get_pstat(st);
stat(oid, st);
}
break;
case Transaction::OP_GETATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- pair<void*,int*> pattrval = t.pattrvals.front(); t.pattrvals.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
+ pair<void*,int*> pattrval;
+ t.get_pattrval(pattrval);
*pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second);
}
break;
case Transaction::OP_GETATTRS:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- map<string,bufferptr> *pset = t.pattrsets.front(); t.pattrsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ map<string,bufferptr> *pset;
+ t.get_pattrset(pset);
getattrs(oid, *pset);
}
break;
case Transaction::OP_WRITE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
- bufferlist bl = t.bls.front(); t.bls.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
+ bufferlist bl;
+ t.get_bl(bl);
write(oid, offset, len, bl, 0);
}
break;
case Transaction::OP_TRIMCACHE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t offset = t.offsets.front(); t.offsets.pop_front();
- size_t len = t.lengths.front(); t.lengths.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
trim_from_cache(oid, offset, len);
}
break;
case Transaction::OP_TRUNCATE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- off_t len = t.offsets.front(); t.offsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ off_t len;
+ t.get_length(len);
truncate(oid, len, 0);
}
break;
case Transaction::OP_REMOVE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
+ object_t oid;
+ t.get_oid(oid);
remove(oid, 0);
}
break;
case Transaction::OP_SETATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
bufferlist bl;
- bl.claim( t.attrbls.front() );
- t.attrbls.pop_front();
+ t.get_bl(bl);
setattr(oid, attrname, bl.c_str(), bl.length(), 0);
}
break;
case Transaction::OP_SETATTRS:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- map<string,bufferptr> *pattrset = t.pattrsets.front(); t.pattrsets.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ map<string,bufferptr> *pattrset;
+ t.get_pattrset(pattrset);
setattrs(oid, *pattrset, 0);
}
break;
case Transaction::OP_RMATTR:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
+ object_t oid;
+ t.get_oid(oid);
+ const char *attrname;
+ t.get_attrname(attrname);
rmattr(oid, attrname, 0);
}
break;
case Transaction::OP_CLONE:
{
- object_t oid = t.oids.front(); t.oids.pop_front();
- object_t noid = t.oids.front(); t.oids.pop_front();
+ object_t oid;
+ t.get_oid(oid);
+ object_t noid;
+ t.get_oid(noid);
clone(oid, noid);
}
break;
case Transaction::OP_MKCOLL:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
create_collection(cid, 0);
}
break;
case Transaction::OP_RMCOLL:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
destroy_collection(cid, 0);
}
break;
case Transaction::OP_COLL_ADD:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- object_t oid = t.oids.front(); t.oids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ object_t oid;
+ t.get_oid(oid);
collection_add(cid, oid, 0);
}
break;
case Transaction::OP_COLL_REMOVE:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- object_t oid = t.oids.front(); t.oids.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ object_t oid;
+ t.get_oid(oid);
collection_remove(cid, oid, 0);
}
break;
case Transaction::OP_COLL_SETATTR:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
- //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ coll_t cid;
+ t.get_cid(cid);
+ const char *attrname;
+ t.get_attrname(attrname);
bufferlist bl;
- bl.claim( t.attrbls.front() );
- t.attrbls.pop_front();
+ t.get_bl(bl);
collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0);
}
break;
case Transaction::OP_COLL_RMATTR:
{
- coll_t cid = t.cids.front(); t.cids.pop_front();
- const char *attrname = t.get_attrname(); t.pop_attrname();
+ coll_t cid;
+ t.get_cid(cid);
+ const char *attrname;
+ t.get_attrname(attrname);
collection_rmattr(cid, attrname, 0);
}
break;
default:
- cerr << "bad op " << *p << endl;
+ cerr << "bad op " << op << endl;
assert(0);
}
}
off_t trim = p->first;
dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl;
+ assert(trim >= ondisklog.bottom);
ondisklog.bottom = trim;
// adjust block_map
// load bounds
ondisklog.bottom = ondisklog.top = 0;
r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
- assert(r == sizeof(ondisklog.bottom));
+ //assert(r == sizeof(ondisklog.bottom));
r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
- assert(r == sizeof(ondisklog.top));
+ //assert(r == sizeof(ondisklog.top));
dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl;
// read
bufferlist bl;
store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
+ if (bl.length() < ondisklog.top-ondisklog.bottom) {
+ dout(0) << "read_log data doesn't match attrs" << dendl;
+ assert(0);
+ }
PG::Log::Entry e;
off_t pos = ondisklog.bottom;
class osdreqid_t {
public:
entity_name_t name; // who
- int inc; // incarnation
+ int32_t inc; // incarnation
tid_t tid;
osdreqid_t() : inc(0), tid(0) {}
osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {}
// pg stuff
-#define PG_INO 1
+#define PG_INO 4 // this should match mds/mdstypes.h MDS_INO_PG
typedef uint16_t ps_t;
typedef uint8_t pruleset_t;
}
+
+
+
/** ObjectLayout
*
* describes an object's placement and layout in the storage cluster.
*/
struct ObjectLayout {
pg_t pgid; // what pg do i belong to
- int stripe_unit; // for object raid in raid pgs
+ int32_t stripe_unit; // for object raid in raid pgs
ObjectLayout() : pgid(0), stripe_unit(0) { }
ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { }
+/** pg_stat
+ * aggregate stats for a single PG.
+ *
+ * A default-constructed pg_stat_t has state 0 (the value of
+ * STATE_UNKNOWN) and zero size / num_blocks; 'reported' is left to
+ * eversion_t's own default constructor.
+ */
+struct pg_stat_t {
+ const static int STATE_UNKNOWN = 0;
+ const static int STATE_OK = 1;
+ const static int STATE_RECOVERING = 2;
+ const static int STATE_OFFLINE = 3;
+
+ eversion_t reported; // version stamp of this report; presumably compared between reports to decide which is newer -- TODO confirm
+
+ int32_t state; // presumably one of the STATE_* values above -- TODO confirm
+ int64_t size; // in bytes
+ int64_t num_blocks; // in 4k blocks
+
+ pg_stat_t() : state(0), size(0), num_blocks(0) {} // state(0) == STATE_UNKNOWN
+};
+
// -----------------------------------------
const static uint64_t MAGIC = 0xeb0f505dULL;
uint64_t magic;
uint64_t fsid; // unique fs id (random number)
- int whoami; // my role in this fs.
+ int32_t whoami; // my role in this fs.
epoch_t current_epoch; // most recent epoch
epoch_t oldest_map, newest_map; // oldest/newest maps we have.
OSDSuperblock(uint64_t f=0, int w=0) :
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
-#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
off_t Journaler::append_entry(bufferlist& bl, Context *onsync)
{
- size_t s = bl.length();
+ uint32_t s = bl.length();
if (!g_conf.journaler_allow_split_entries) {
// will we span a stripe boundary?
}
}
- dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl;
+ dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(uint32_t)) << endl;
+ // cache?
+ // NOTE: this is a dumb thing to do; it is used for benchmarking
+ // purposes only.
+ if (g_conf.journaler_cache &&
+ write_pos == read_pos + read_buf.length()) {
+ dout(10) << "append_entry caching in read_buf too" << endl;
+ assert(requested_pos == received_pos);
+ assert(requested_pos == read_pos + read_buf.length());
+ read_buf.append((char*)&s, sizeof(s));
+ read_buf.append(bl);
+ requested_pos = received_pos = write_pos + sizeof(s) + s;
+ }
+
// append
write_buf.append((char*)&s, sizeof(s));
- write_buf.append(bl);
+ write_buf.claim_append(bl);
write_pos += sizeof(s) + s;
// flush now?
if (read_pos == write_pos) return false;
// have enough for entry size?
- size_t s = 0;
+ uint32_t s = 0;
if (read_buf.length() >= sizeof(s))
read_buf.copy(0, sizeof(s), (char*)&s);
return false;
}
- size_t s;
+ uint32_t s;
assert(read_buf.length() >= sizeof(s));
read_buf.copy(0, sizeof(s), (char*)&s);
assert(read_buf.length() >= sizeof(s) + s);
BufferHead *bh = p->second;
dout(10) << "map_write bh " << *bh << " intersected" << endl;
- /*if (bh->is_dirty()) {
- // already dirty, let's use it.
- final = bh;
- } else {
- */
if (p->first < cur) {
assert(final == 0);
if (cur + max >= p->first + p->second->length()) {
split(final, cur+max);
}
} else if (p->first == cur) {
- /*if (bh->is_dirty()) {
- // already dirty, use it.
- }
- else*/
if (p->second->length() <= max) {
// whole bufferhead, piece of cake.
} else {
split(bh, cur + max); // just split
}
if (final)
- merge_left(final,bh);
+ merge_left(final, bh);
else
final = bh;
}
dout(7) << "bh_read_finish "
<< oid
<< " " << start << "~" << length
+ << " (bl is " << bl.length() << ")"
<< endl;
+
+ if (bl.length() < length) {
+ bufferptr bp(length - bl.length());
+ bp.zero();
+ dout(7) << "bh_read_finish " << oid << " padding " << start << "~" << length
+ << " with " << bp.length() << " bytes of zeroes" << endl;
+ bl.push_back(bp);
+ }
if (objects.count(oid) == 0) {
dout(7) << "bh_read_finish no object cache" << endl;
for (map<off_t, BufferHead*>::iterator bh_it = hits.begin();
bh_it != hits.end();
bh_it++) {
- dout(10) << "readx hit bh " << *bh_it->second << endl;
+ dout(10) << "readx hit bh " << *bh_it->second << endl;
hit_ls.push_back(bh_it->second);
- }
+ }
// create reverse map of buffer offset -> object for the eventual result.
// this is over a single ObjectExtent, so we know that
// - the buffer frags need not be (and almost certainly aren't)
off_t opos = ex_it->start;
map<off_t, BufferHead*>::iterator bh_it = hits.begin();
- assert(bh_it->second->start() <= opos);
+ assert(bh_it->second->start() <= opos);
size_t bhoff = opos - bh_it->second->start();
map<size_t,size_t>::iterator f_it = ex_it->buffer_extents.begin();
size_t foff = 0;
size_t len = MIN(f_it->second - foff,
bh->length() - bhoff);
- stripe_map[f_it->first].substr_of(bh->bl,
- opos - bh->start(),
- len);
+ bufferlist bit; // put substr here first, since substr_of clobbers, and
+ // we may get multiple bh's at this stripe_map position
+ bit.substr_of(bh->bl,
+ opos - bh->start(),
+ len);
+ stripe_map[f_it->first].claim_append(bit);
+
opos += len;
bhoff += len;
foff += len;
dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl;
pos += i->second.length();
rd->bl->claim_append(i->second);
+ assert(rd->bl->length() == pos);
}
dout(10) << "readx result is " << rd->bl->length() << endl;
touch_bh(bh);
bh->last_write = now;
- // recombine with left?
+ // combine with left?
map<off_t,BufferHead*>::iterator p = o->data.find(bh->start());
+ assert(p->second == bh);
if (p != o->data.begin()) {
- assert(p->second == bh);
p--;
- if (p->second->is_dirty()) {
+ if (p->second->is_dirty() &&
+ p->second->end() == bh->start()) {
o->merge_left(p->second, bh);
bh = p->second;
- }
+ } else
+ p++;
}
- // right?
- while (1) {
- p = o->data.find(bh->start());
- assert(p->second == bh);
- p++;
- if (p == o->data.end() || !p->second->is_dirty()) break;
+    // combine to the right?  only when the right-hand neighbor is itself
+    // dirty and begins exactly where bh ends -- the mirror image of the
+    // left-hand check above.  merging a clean or non-adjacent buffer into
+    // a dirty one would mark data dirty (or span a gap) that was never
+    // written.
+    assert(p->second == bh);
+    p++;
+    if (p != o->data.end() &&
+        p->second->is_dirty() &&
+        p->second->start() == bh->end())
o->merge_left(bh, p->second);
- }
}
delete wr;
ObjectCacher *oc;
object_t oid; // this _always_ is oid.rev=0
inodeno_t ino;
- objectrev_t rev; // last rev we're written
- ObjectLayout layout;
+ objectrev_t rev; // last rev we've written
+ ObjectLayout layout;
public:
map<off_t, BufferHead*> data;
if (bh.is_dirty()) out << " dirty";
if (bh.is_clean()) out << " clean";
if (bh.is_missing()) out << " missing";
+ if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0];
out << "]";
return out;
}
!g_conf.objecter_buffer_uncommitted) {
dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl;
} else {
- dout(0) << "kick_requests missing commit, replay write " << tid
+ dout(3) << "kick_requests missing commit, replay write " << tid
<< " v " << wr->tid_version[tid] << endl;
modifyx_submit(wr, wr->waitfor_commit[tid], tid);
}
}
else if (wr->waitfor_ack.count(tid)) {
- dout(0) << "kick_requests missing ack, resub write " << tid << endl;
+ dout(3) << "kick_requests missing ack, resub write " << tid << endl;
modifyx_submit(wr, wr->waitfor_ack[tid], tid);
}
}
// READ
OSDRead *rd = op_read[tid];
op_read.erase(tid);
- dout(0) << "kick_requests resub read " << tid << endl;
+ dout(3) << "kick_requests resub read " << tid << endl;
// resubmit
readx_submit(rd, rd->ops[tid], true);
rd->ops.erase(tid);
}
- else if (op_stat.count(tid)) {
- OSDStat *st = op_stat[tid];
- op_stat.erase(tid);
-
- dout(0) << "kick_requests resub stat " << tid << endl;
+ else if (op_stat.count(tid)) {
+ OSDStat *st = op_stat[tid];
+ op_stat.erase(tid);
+
+ dout(3) << "kick_requests resub stat " << tid << endl;
// resubmit
stat_submit(st);
- }
+ }
else
assert(0);
--- /dev/null
+
+#include "include/types.h"
+#include "include/frag.h"
+
+// Small manual exercise for fragtree_t: apply a fixed sequence of splits,
+// print the resulting tree, then call force_to_leaf() on one fragment.
+int main(int argc, char **argv)
+{
+  fragtree_t ft;
+
+  // fixed split sequence
+  ft.split(frag_t(), 2);
+  ft.split(frag_t(0,2), 1);
+  ft.split(frag_t(1,2), 1);
+  ft.split(frag_t(2,2), 1);
+  ft.split(frag_t(1,3), 1);
+  cout << "tree is " << ft << endl;
+
+  // fragment to be forced to a leaf
+  frag_t wanted(2,4);
+  cout << "fg is " << wanted << endl;
+  ft.force_to_leaf(wanted);
+
+  return 0;
+}
--- /dev/null
+
+#include "common/DecayCounter.h"
+
+#include <list>
+using namespace std;
+
+// Exact reference counter: remembers the timestamp of every hit and counts
+// how many are still inside a trailing window.
+struct RealCounter {
+public:
+  list<int> hits;   // hit timestamps, in the order they were recorded
+
+  // record one hit at time ms.
+  void hit(int ms) {
+    hits.push_back(ms);
+  }
+
+  // drop leading hits older than (now - hl), then return how many remain.
+  int get(double hl, int now) {
+    const int cutoff = now - hl;   // double result converted to int, as trim() takes an int
+    trim(cutoff);
+    return hits.size();
+  }
+
+  // pop entries off the front while they are strictly less than 'to'.
+  void trim(int to) {
+    for (;;) {
+      if (hits.empty())
+        return;
+      if (hits.front() >= to)
+        return;
+      hits.pop_front();
+    }
+  }
+};
+
+// Compare DecayCounter against the exact RealCounter under a synthetic hit
+// stream.  Usage: <prog> <halflife>.  Simulates 300000 one-millisecond ticks;
+// every 30000 ticks a new target hit rate is picked (0 after tick 200000),
+// and every 100 ticks one tab-separated row of readings is written to stdout.
+int main(int argc, char **argv)
+{
+  // the half-life argument is mandatory; without this guard atof() would be
+  // handed a null argv[1].
+  if (argc < 2) {
+    cerr << "usage: " << (argc > 0 ? argv[0] : "testcounter") << " <halflife>" << endl;
+    return 1;
+  }
+
+  int target = 0;   // assigned on the first tick (ms == 0); initialized for safety
+  double hl = atof(argv[1]);
+  cerr << "halflife " << hl << endl;
+
+  DecayCounter dc(hl);
+  RealCounter rc;
+
+  utime_t now = g_clock.now();
+
+  for (int ms=0; ms < 300*1000; ms++) {
+    // pick a new target rate every 30000 ticks; go quiet for the tail.
+    if (ms % 30000 == 0) {
+      target = 1 + (rand() % 10) * 10;
+      if (ms > 200000) target = 0;
+    }
+
+    // target is at most 91 here, so 1000/target is never zero.
+    if (target &&
+        (rand() % (1000/target) == 0)) {
+      dc.hit();
+      rc.hit(ms);
+    }
+
+    if (ms % 500 == 0) dc.get(now);
+    if (ms % 100 == 0) {
+      //dc.get(now);
+      // sample through a copy so that reading does not touch dc itself.
+      DecayCounter o = dc;
+      cout << ms << "\t"
+           << target*hl << "\t"
+           << rc.get(hl*1000, ms) << "\t"
+           << o.get(now) << "\t"
+           << dc.val << "\t"
+        //   << dc.delta << "\t"
+           << o.get_last_vel() << "\t"
+           << o.get_last() + o.get_last_vel() << "\t"
+           << endl;
+    }
+
+    now += .001;
+  }
+
+  return 0;
+}