From 8c7283a709cbd1031e4adc5c129a44d81b7a0f8d Mon Sep 17 00:00:00 2001 From: sage Date: Tue, 19 Jul 2005 18:08:29 +0000 Subject: [PATCH] lots of cleanup clock stuff bug fixes client capabilities! object store attributes, collections (untested in FakeStore) git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@463 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/Makefile | 18 +- ceph/TODO | 59 +---- ceph/client/Buffercache.cc | 6 +- ceph/client/Buffercache.h | 8 +- ceph/client/Client.cc | 317 +++++++++++++++---------- ceph/client/Client.h | 73 ++++-- ceph/client/SyntheticClient.cc | 44 ++-- ceph/client/SyntheticClient.h | 6 +- ceph/client/Trace.cc | 2 +- ceph/client/fuse.cc | 50 +--- ceph/common/Clock.cc | 5 + ceph/common/Clock.h | 144 +++++++---- ceph/common/Cond.h | 8 +- ceph/common/DecayCounter.h | 29 +-- ceph/common/LogType.h | 9 - ceph/common/Logger.cc | 10 +- ceph/common/Logger.h | 9 +- ceph/common/Mutex.h | 2 +- ceph/common/ThreadPool.h | 4 +- ceph/common/Timer.cc | 25 +- ceph/common/Timer.h | 20 +- ceph/common/clock.cc | 55 ----- ceph/config.cc | 5 +- ceph/fakefuse.cc | 4 +- ceph/fakesyn.cc | 4 +- ceph/include/Distribution.h | 12 +- ceph/include/buffer.h | 50 ++-- ceph/include/bufferlist.h | 38 +-- ceph/include/filepath.h | 8 +- ceph/include/lru.h | 6 +- ceph/include/types.h | 42 +++- ceph/mds/AnchorTable.cc | 4 +- ceph/mds/CDentry.h | 26 +- ceph/mds/CDir.cc | 17 +- ceph/mds/CDir.h | 27 +-- ceph/mds/CFile.h | 69 ------ ceph/mds/CInode.cc | 6 +- ceph/mds/CInode.h | 129 ++++------ ceph/mds/Capability.h | 142 ++++++++++- ceph/mds/IdAllocator.cc | 10 +- ceph/mds/IdAllocator.h | 2 +- ceph/mds/Lock.h | 5 +- ceph/mds/LogEvent.h | 3 +- ceph/mds/LogStream.cc | 2 +- ceph/mds/MDBalancer.cc | 80 +++---- ceph/mds/MDBalancer.h | 2 +- ceph/mds/MDCache.cc | 160 +++++++------ ceph/mds/MDCache.h | 13 +- ceph/mds/MDCluster.cc | 2 +- ceph/mds/MDLog.cc | 7 +- ceph/mds/MDLog.h | 4 +- ceph/mds/MDS.cc | 191 +++++++-------- ceph/mds/MDS.h | 4 +- ceph/mds/MDStore.cc | 32 +-- ceph/mds/MDStore.h | 10 +- ceph/mds/OSDMonitor.cc | 2 +- ceph/mds/events/EAlloc.h | 2 +- ceph/mds/events/EInodeUpdate.h | 2 +- ceph/mds/events/EUnlink.h | 2 +- ceph/messages/MAnchorReply.h | 2 +- ceph/messages/MAnchorRequest.h | 2 +- ceph/messages/MClientFileCaps.h | 22 +- ceph/messages/MClientInodeAuthUpdate.h | 16 +- ceph/messages/MClientMountAck.h | 2 +- ceph/messages/MClientReply.h | 11 +- ceph/messages/MClientRequest.h | 4 +- ceph/msg/CheesySerializer.cc | 2 +- ceph/msg/FakeMessenger.cc | 36 +-- ceph/msg/HostMonitor.cc | 4 +- ceph/msg/MPIMessenger.cc | 14 +- ceph/msg/Message.h | 2 +- ceph/msg/Messenger.cc | 16 +- ceph/msg/TCPMessenger.cc | 17 +- ceph/osd/BDBMap.h | 97 ++++++++ ceph/osd/FakeStore.cc | 188 ++++++++------- ceph/osd/FakeStore.h | 68 +++--- ceph/osd/OBFSStore.cc | 2 +- ceph/osd/OSD.cc | 2 +- ceph/osd/OSDMap.h | 4 +- ceph/osd/ObjectStore.h | 28 ++- ceph/osd/rush.cc | 2 +- ceph/osdc/Filer.cc | 50 ++-- ceph/osdc/Filer.h | 12 +- ceph/tcpfuse.cc | 2 +- ceph/tcpsyn.cc | 2 +- ceph/test/fakemds.cc | 2 +- ceph/test/mpitest.cc | 2 +- ceph/test/testmpi.cc | 2 +- 88 files changed, 1418 insertions(+), 1222 deletions(-) create mode 100644 ceph/common/Clock.cc delete mode 100644 ceph/common/clock.cc delete mode 100644 ceph/mds/CFile.h create mode 100644 ceph/osd/BDBMap.h diff --git a/ceph/Makefile b/ceph/Makefile index 56472d2bb4765..c12c6b4c5e938 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -8,8 +8,8 @@ # This makes it less annoying to build on non-mpi hosts for dev work, and seems to # behave just fine... change ${CC} back to mpicxx if you get paranoid. CC = g++ -CFLAGS = -O2 -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -LIBS = -lpthread -lrt +CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +LIBS = -lpthread -lrt -ldb #for normal mpich2 machines MPICC = mpicxx @@ -47,11 +47,12 @@ COMMON_OBJS= \ osd/OSDCluster.o\ osd/rush.o\ common/Logger.o\ - common/clock.o\ + common/Clock.o\ common/Timer.o\ config.o SYN_OBJS = \ + client/SyntheticClient.o\ client/Trace.o TEST_TARGETS = fakemds mpitest @@ -96,20 +97,21 @@ mttest: test/mttest.cc msg/MTMessenger.cc ${COMMON_OBJS} fakefuse: fakefuse.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o client/fuse.o msg/FakeMessenger.cc ${COMMON_OBJS} ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -tcpfuse: tcpfuse.cc mds/allmds.o client/Client.o client/Buffercache.o client/SyntheticClient.o osd/OSD.o client/fuse.o msg/TCPMessenger.cc ${COMMON_OBJS} +tcpfuse: tcpfuse.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o client/fuse.o msg/TCPMessenger.cc ${COMMON_OBJS} ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -mpifuse: mpifuse.cc mds/allmds.o client/Client.o client/Buffercache.o client/SyntheticClient.o osd/OSD.o client/fuse.o msg/MPIMessenger.cc ${COMMON_OBJS} +mpifuse: mpifuse.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o client/fuse.o msg/MPIMessenger.cc ${COMMON_OBJS} ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ + # synthetic workload -fakesyn: fakesyn.cc mds/allmds.o client/Client.o client/Buffercache.o client/SyntheticClient.o osd/OSD.o msg/FakeMessenger.o ${COMMON_OBJS} ${SYN_OBJS} +fakesyn: fakesyn.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o msg/FakeMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ -mpisyn: mpisyn.cc mds/allmds.o client/Client.o client/Buffercache.o client/SyntheticClient.o osd/OSD.o msg/MPIMessenger.cc ${COMMON_OBJS} ${SYN_OBJS} +mpisyn: mpisyn.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o msg/MPIMessenger.cc ${COMMON_OBJS} ${SYN_OBJS} ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ -tcpsyn: tcpsyn.cc mds/allmds.o client/Client.o client/Buffercache.o client/SyntheticClient.o osd/OSD.o msg/TCPMessenger.cc ${COMMON_OBJS} ${SYN_OBJS} +tcpsyn: tcpsyn.cc mds/allmds.o client/Client.o client/Buffercache.o osd/OSD.o msg/TCPMessenger.cc ${COMMON_OBJS} ${SYN_OBJS} ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ # obfs + synthetic diff --git a/ceph/TODO b/ceph/TODO index 0566cd6eb69d3..331576a8083e7 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,64 +1,27 @@ -MDS tests -/- scaling (graph) -- latency (table) - - w/wo journal - ...proc at mds, 152us vs 2.5ms - ...total rtt from client, 1.5ms vs 4.5ms -- throughput vs latency (graph) -- get number for commit overhead (worst case) - ...5-15% in workloads we tested - - -OBFS -- ext2 vs obfs -- single box thing -- read prefetch performance thing - -MACRO -- compile trace w/ read+write - - client latency!! - - - - - +- test exports w/ new capability stuff KNOWN BUGS to fix after fast - RDWR on synthetic client results in fakesyn: mds/MDS.cc:2334: void MDS::handle_client_close(MClientRequest*, CInode*): Assertion `cur->softlock.can_write(true)' failed. -- caps needs to be rewritten - - MDS currently not reclaiming fh's (otherwise a client assertion eventually fails) -- hard links! -- g_clock.gettimepair() all over the place.. need a get_recent()! -- the client request direction is sort of borken.. some reqs have to go to auth, others don't -- implement truncate() for real - - -big fast todo's: -- client buffer cache -- replication protocol -- heartbeat protocol - -- mds benchmarks - - pseudo-mega-filesystem - - osd copy-on-write.. - -- osd recovery structures (per-RG, etc) -- heartbeatmonitor vs pingmonitor? +- hard links! +- implement truncate() for real -ask tyce+bill: -- obfs on alc? +UPCOMING TODOS: +/- redo client capability stuff +- finish buffer cache +- hash directories!! -md tests: -- log length versus cache size, workload +- plan out osd replication, recovery structures +- redo CDir hash in terms of const char * in CDentry? +- what to do about fuse direct_io and mmap()? finish HARD LINKS @@ -89,8 +52,6 @@ OSD TODO - osd cluster expansion CLIENT TODO -- finish basic metadata cache (remove items...) -- block i/o on missing caps - buffer cache - strong consistency - get short-term leases from MDS (for stat etc) diff --git a/ceph/client/Buffercache.cc b/ceph/client/Buffercache.cc index 292e9aa7a9bad..43c5a93e85685 100644 --- a/ceph/client/Buffercache.cc +++ b/ceph/client/Buffercache.cc @@ -1,6 +1,6 @@ #include "Buffercache.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "client" << "." << pthread_self() << " " @@ -51,7 +51,7 @@ void Bufferhead::alloc_buffers(size_t size) { dout(10) << "bc: allocating buffers size: " << size << endl; while (size > 0) { - if (size <= g_conf.client_bcache_alloc_maxsize) { + if (size <= (unsigned)g_conf.client_bcache_alloc_maxsize) { size_t k = g_conf.client_bcache_alloc_minsize; size_t asize = size - size % k + (size % k > 0) * k; buffer *b = new buffer(asize); @@ -165,7 +165,7 @@ void Bufferhead::claim_append(Bufferhead *other) void Dirtybuffers::erase(Bufferhead* bh) { dout(7) << "dirtybuffer: erase bh->ino: " << bh->ino << " offset: " << bh->offset << endl; - int osize = _dbufs.size(); + unsigned osize = _dbufs.size(); for (multimap::iterator it = _dbufs.lower_bound(bh->dirty_since); it != _dbufs.upper_bound(bh->dirty_since); it++) { diff --git a/ceph/client/Buffercache.h b/ceph/client/Buffercache.h index c83ff824192f1..8a3c0d0c6e704 100644 --- a/ceph/client/Buffercache.h +++ b/ceph/client/Buffercache.h @@ -7,7 +7,7 @@ #include "include/buffer.h" #include "include/bufferlist.h" #include "include/lru.h" -#include "include/config.h" +#include "config.h" #include "common/Cond.h" // stl @@ -34,12 +34,12 @@ class Bufferhead : public LRUObject { int get() { assert(ref >= 0); if (ref == 0) lru_pin(); - ref++; + return ++ref; } int put() { assert(ref > 0); - ref--; - if (ref == 0) lru_unpin(); + if (ref == 1) lru_unpin(); + return --ref; } off_t offset; diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index 9dde53e0af229..13ba15ad87b7a 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "messages/MClientMount.h" #include "messages/MClientMountAck.h" @@ -20,7 +22,7 @@ #include "common/Mutex.h" #include "common/Logger.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." << pthread_self() << " " @@ -48,6 +50,9 @@ Client::Client(MDCluster *mdc, int id, Messenger *m) set_cache_size(g_conf.client_cache_size); + // file handles + free_fh_set.map_insert(10, 1<<30); + // set up messengers messenger = m; messenger->set_dispatcher(this); @@ -70,7 +75,7 @@ Client::~Client() void Client::tear_down_cache() { // fh's - for (hash_map::iterator it = fh_map.begin(); + for (hash_map::iterator it = fh_map.begin(); it != fh_map.end(); it++) { Fh *fh = it->second; @@ -82,7 +87,7 @@ void Client::tear_down_cache() // empty lru lru.lru_set_max(0); - int last = 0; + unsigned last = 0; while (lru.lru_get_size() != last) { last = lru.lru_get_size(); dout(10) << "trim pass, size is " << last << endl; @@ -111,6 +116,7 @@ void Client::dump_inode(Inode *in, set& did) if (in->dir) { dout(1) << " dir size " << in->dir->dentries.size() << endl; + //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); for (hash_map::iterator it = in->dir->dentries.begin(); it != in->dir->dentries.end(); it++) { @@ -216,6 +222,14 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) // actually update info dn->inode->inode = in_info->inode; + + // or do we have newer size/mtime from writing? + if (dn->inode->file_caps & CAP_FILE_WR) { + if (dn->inode->file_wr_size > dn->inode->inode.size) + dn->inode->inode.size = dn->inode->file_wr_size; + if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) + dn->inode->inode.mtime = dn->inode->file_wr_mtime; + } // symlink? if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { @@ -248,7 +262,7 @@ void Client::insert_trace(const vector& trace) if (trace.empty()) return; - for (int i=0; ifinc("lsum",(double)lat); client_logger->inc("lnum"); if (nojournal) { - client_logger->finc("lrsum",timepair_to_double(lat)); + client_logger->finc("lrsum",(double)lat); client_logger->inc("lrnum"); } else { - client_logger->finc("lwsum",timepair_to_double(lat)); + client_logger->finc("lwsum",(double)lat); client_logger->inc("lwnum"); } } @@ -514,9 +527,9 @@ void Client::flush_buffers(int ttl, size_t dirty_size) void Client::trim_bcache() { - if (bc.get_total_size() > g_conf.client_bcache_size) { + if (bc.get_total_size() > (unsigned) g_conf.client_bcache_size) { // need to free buffers - if (bc.get_dirty_size() > g_conf.client_bcache_hiwater * g_conf.client_bcache_size / 100) { + if (bc.get_dirty_size() > (unsigned)g_conf.client_bcache_hiwater * (unsigned)g_conf.client_bcache_size / 100UL) { // flush buffers until we have low water mark size_t want_target_size = (size_t) g_conf.client_bcache_lowater * g_conf.client_bcache_size / 100; flush_buffers(g_conf.client_bcache_ttl, want_target_size); @@ -544,62 +557,54 @@ void Client::release_inode_buffers(Inode *in) void Client::handle_file_caps(MClientFileCaps *m) { - Fh *f = fh_map[m->get_fh()]; - - if (f) { - // file is still open, we care + Inode *in = inode_map[ m->get_ino() ]; + assert(in); - // auth? - if (m->get_mds() >= 0) { - f->mds = m->get_mds(); - dout(5) << "handle_file_caps on fh " << m->get_fh() << " mds now " << f->mds << endl; - } + // auth? + if (m->get_mds() >= 0) { + in->file_mds = m->get_mds(); + dout(5) << "handle_file_caps on in " << m->get_ino() << " mds now " << in->file_mds << endl; + } - f->caps = m->get_caps(); - dout(5) << "handle_file_caps on fh " << m->get_fh() << " caps now " << f->caps << endl; + in->file_caps = m->get_caps(); + dout(5) << "handle_file_caps on in " << m->get_ino() << " caps now " << in->file_caps << endl; - // update inode - Inode *in = f->inode; - assert(in); - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! + // update inode + in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - // flush buffers? - if (f->caps & CFILE_CAP_WRBUFFER == 0) - flush_inode_buffers(in); - - // release buffers? - if (f->caps & CFILE_CAP_RDCACHE == 0) - release_inode_buffers(in); - - // ack - if (m->needs_ack()) { - dout(5) << "acking" << endl; - messenger->send_message(m, m->get_source(), m->get_source_port()); - return; - } + // flush buffers? + if (in->file_caps & CAP_FILE_WRBUFFER == 0) + flush_inode_buffers(in); - // wake up waiters? - if (f->caps & CFILE_CAP_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_read.clear(); + // release buffers? + if (in->file_caps & CAP_FILE_RDCACHE == 0) + release_inode_buffers(in); + + // ack + if (m->needs_ack()) { + dout(5) << "acking" << endl; + messenger->send_message(m, m->get_source(), m->get_source_port()); + return; + } + + // wake up waiters? + if (in->file_caps & CAP_FILE_RD) { + for (list::iterator it = in->waitfor_read.begin(); + it != in->waitfor_read.end(); + it++) { + dout(5) << "signaling read waiter " << *it << endl; + (*it)->Signal(); } - if (f->caps & CFILE_CAP_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_write.clear(); + in->waitfor_read.clear(); + } + if (in->file_caps & CAP_FILE_WR) { + for (list::iterator it = in->waitfor_write.begin(); + it != in->waitfor_write.end(); + it++) { + dout(5) << "signaling write waiter " << *it << endl; + (*it)->Signal(); } - - } else { - dout(5) << "handle_file_caps fh is closed or closing, dropping" << endl; + in->waitfor_write.clear(); } delete m; @@ -636,14 +641,17 @@ int Client::mount(int mkfs) client_lock.Unlock(); + /* dout(3) << "op: // client trace data structs" << endl; dout(3) << "op: struct stat st;" << endl; dout(3) << "op: struct utimbuf utim;" << endl; dout(3) << "op: int readlinkbuf_len = 1000;" << endl; dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; dout(3) << "op: map dir_contents;" << endl; - dout(3) << "op: map open_files;" << endl; - dout(3) << "op: fileh_t fh;" << endl; + dout(3) << "op: map open_files;" << endl; + dout(3) << "op: fh_t fh;" << endl; + */ + return 0; } int Client::unmount() @@ -664,6 +672,7 @@ int Client::unmount() delete reply; client_lock.Unlock(); + return 0; } @@ -890,7 +899,7 @@ int Client::readlink(const char *path, char *buf, size_t size) assert(in); // i just did a stat // copy into buf (at most size bytes) - int res = in->symlink->length(); + unsigned res = in->symlink->length(); if (res > size) res = size; memcpy(buf, in->symlink->c_str(), res); @@ -1179,6 +1188,17 @@ int Client::open(const char *path, int mode) tout << path << endl; tout << mode << endl; + int cmode = 0; + if (mode & O_WRONLY) + cmode = FILE_MODE_W; + else if (mode & O_RDWR) + cmode = FILE_MODE_RW; + else if (mode & O_APPEND) + cmode = FILE_MODE_W; + else + cmode = FILE_MODE_R; + + // go MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami); req->set_path(path); req->set_iarg(mode); @@ -1197,34 +1217,48 @@ int Client::open(const char *path, int mode) int result = reply->get_result(); // success? - if (result > 0) { + fh_t fh = 0; + if (result >= 0) { // yay - fileh_t fh = reply->get_result(); // FIXME? Fh *f = new Fh; memset(f, 0, sizeof(*f)); - f->mds = reply->get_source(); - f->caps = reply->get_file_caps(); + f->mode = cmode; // inode f->inode = inode_map[trace[trace.size()-1]->inode.ino]; assert(f->inode); f->inode->get(); - + f->inode->file_mds = reply->get_source(); + + if (cmode & FILE_MODE_R) + f->inode->num_rd++; + if (cmode & FILE_MODE_W) + f->inode->num_wr++; + + // caps + if (f->inode->file_caps_seq == 0) + f->inode->get(); + f->inode->file_caps = reply->get_file_caps(); + assert(f->inode->file_caps_seq < reply->get_file_caps_seq()); // ordered delivery + f->inode->file_caps_seq = reply->get_file_caps_seq(); + // put in map + result = fh = get_fh(); assert(fh_map.count(fh) == 0); fh_map[fh] = f; - - dout(3) << "open success, fh is " << fh << " caps " << f->caps << " fh size " << f->size << endl; + + dout(3) << "open success, fh is " << fh << " caps " << f->inode->file_caps << endl;//f->caps << " fh size " << f->size << endl; } delete reply; trim_cache(); client_lock.Unlock(); + return result; } -int Client::close(fileh_t fh) +int Client::close(fh_t fh) { client_lock.Lock(); dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; @@ -1232,28 +1266,21 @@ int Client::close(fileh_t fh) tout << "close" << endl; tout << fh << endl; - + // get Fh, Inode assert(fh_map.count(fh)); Fh *f = fh_map[fh]; Inode *in = f->inode; - MClientRequest *req = new MClientRequest(MDS_OP_CLOSE, whoami); - req->set_iarg(fh); - req->set_ino(in->inode.ino); - - req->set_targ( f->mtime ); - req->set_sizearg( f->size ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - // Make sure buffers are all clean! //flush_inode_buffers(in); - - // take note of the fact that we're mid-close - /* mds may ack our close() after reissuing same fh to another open; remove from - fh_map _before_ sending request. */ + + // update inode + if (f->mode & FILE_MODE_R) + in->num_rd--; + if (f->mode & FILE_MODE_W) + in->num_wr--; + + // hose fh fh_map.erase(fh); delete f; @@ -1262,13 +1289,44 @@ int Client::close(fileh_t fh) //release_inode_buffers(in); put_inode( in ); + int result = 0; + + // release caps right away? + if (in->num_rd == 0 && + in->num_wr == 0) { + // synchronously; FIXME this is dumb + + MClientRequest *req = new MClientRequest(MDS_OP_RELEASE, whoami); + req->set_ino(in->inode.ino); + + req->set_iarg( in->file_caps_seq ); + req->set_targ( in->file_wr_mtime ); + req->set_sizearg( in->file_wr_size ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + MClientReply *reply = make_request(req, true, mds_auth); + assert(reply); + int result = reply->get_result(); + assert(result == 0); + + // success? + if (in->file_caps_seq == reply->get_file_caps_seq()) { + // yup. + dout(5) << "successfully released caps" << endl; + in->file_caps_seq = 0; + in->file_caps = 0; + in->file_wr_mtime = 0; + in->file_wr_size = 0; + put_inode(in); + } else { + dout(5) << "failed to release caps; i had " << in->file_caps_seq << " mds had " << reply->get_file_caps_seq() << endl; + } - MClientReply *reply = make_request(req, true, mds_auth); - assert(reply); - int result = reply->get_result(); - dout(3) << "close " << fh << " result = " << result << endl; - - delete reply; + delete reply; + } client_lock.Unlock(); return result; @@ -1325,7 +1383,7 @@ public: }; -int Client::read(fileh_t fh, char *buf, size_t size, off_t offset) +int Client::read(fh_t fh, char *buf, size_t size, off_t offset) { client_lock.Lock(); @@ -1340,22 +1398,35 @@ int Client::read(fileh_t fh, char *buf, size_t size, off_t offset) Inode *in = f->inode; // do we have read file cap? - while (f->caps & CFILE_CAP_RD == 0) { + while (in->file_caps & CAP_FILE_RD == 0) { dout(7) << " don't have read cap, waiting" << endl; Cond cond; in->waitfor_read.push_back(&cond); cond.Wait(client_lock); } - - + + // determine whether read range overlaps with file - // FIXME: maybe we should stat the file again? - dout(10) << "file size: " << in->inode.size << endl; - if (offset >= in->inode.size) { - client_lock.Unlock(); - return 0; + // ...ONLY if we're doing async io + if (in->file_caps & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE)) { + // we're doing buffered i/o. make sure we're inside the file. + // we can trust size info bc we get accurate info when buffering/caching caps are issued. + dout(10) << "file size: " << in->inode.size << endl; + if (offset > 0 && (size_t)offset >= in->inode.size) { + client_lock.Unlock(); + return 0; + } + if (size > in->inode.size) size = in->inode.size; + + if (size == 0) { + dout(10) << "read is size=0, returning 0" << endl; + client_lock.Unlock(); + return 0; + } + } else { + // unbuffered, synchronous file i/o. + // defer to OSDs for file bounds. } - if (size > in->inode.size) size = in->inode.size; int rvalue = 0; @@ -1392,7 +1463,8 @@ int Client::read(fileh_t fh, char *buf, size_t size, off_t offset) assert(rvalue > 0); dout(7) << "read bc hit: immediately returning " << rvalue << " bytes" << endl; } - assert(!(rvalue == size) || holes.empty()); + assert(!(rvalue >= 0 && (size_t)rvalue == size) || holes.empty()); + // issue reads for holes int hole_rvalue = 0; //FIXME: don't really need to track rvalue in MissFinish context for (hole = holes.begin(); hole != holes.end(); hole++) { @@ -1469,7 +1541,7 @@ public: }; -int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) +int Client::write(fh_t fh, const char *buf, size_t size, off_t offset) { client_lock.Lock(); @@ -1484,11 +1556,11 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) Fh *f = fh_map[fh]; Inode *in = f->inode; - dout(10) << "cur file size is " << in->inode.size << " fh size " << f->size << endl; + dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl; // do we have write file cap? - while (f->caps & CFILE_CAP_WR == 0) { + while (in->file_caps & CAP_FILE_WR == 0) { dout(7) << " don't have write cap, waiting" << endl; Cond cond; in->waitfor_write.push_back(&cond); @@ -1497,7 +1569,7 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) if (g_conf.client_bcache && // buffer cache ON? - f->caps & CFILE_CAP_WRBUFFER) { // caps buffered write? + in->file_caps & CAP_FILE_WRBUFFER) { // caps buffered write? // buffered write dout(7) << "buffered/async write" << endl; @@ -1565,16 +1637,15 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) size_t totalwritten = size; // extend file? - if (totalwritten + offset > f->size) { - f->size = totalwritten + offset; - in->inode.size = f->size; + if (totalwritten + (size_t)offset > in->inode.size) { + in->inode.size = in->file_wr_size = totalwritten + offset; dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl; } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << f->size << endl; + dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl; } // mtime - f->mtime = in->inode.mtime = g_clock.gettime(); + in->file_wr_mtime = in->inode.mtime = g_clock.gettime(); // ok! client_lock.Unlock(); @@ -1611,7 +1682,7 @@ int Client::truncate(const char *file, off_t size) } -int Client::fsync(fileh_t fh, bool syncdataonly) +int Client::fsync(fh_t fh, bool syncdataonly) { client_lock.Lock(); dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl; @@ -1630,7 +1701,7 @@ int Client::fsync(fileh_t fh, bool syncdataonly) flush_inode_buffers(in); if (syncdataonly && - (f->caps & CFILE_CAP_WR)) { + (in->file_caps & CAP_FILE_WR)) { // flush metadata too.. size, mtime // ... } @@ -1642,4 +1713,8 @@ int Client::fsync(fileh_t fh, bool syncdataonly) // not written yet, but i want to link! -int Client::statfs(const char *path, struct statfs *stbuf) {} +int Client::statfs(const char *path, struct statfs *stbuf) +{ + assert(0); // implement me + return 0; +} diff --git a/ceph/client/Client.h b/ceph/client/Client.h index 169562514d079..53784db493818 100644 --- a/ceph/client/Client.h +++ b/ceph/client/Client.h @@ -19,6 +19,7 @@ #include "include/types.h" #include "include/lru.h" #include "include/filepath.h" +#include "include/rangeset.h" #include "common/Mutex.h" @@ -50,12 +51,15 @@ extern class Logger *client_logger; */ +typedef int fh_t; + class Dir; class Inode; class Dentry : public LRUObject { public: string name; // sort of lame + //const char *name; Dir *dir; Inode *inode; int ref; // 1 if there's a dir beneath me. @@ -63,18 +67,26 @@ class Dentry : public LRUObject { void get() { assert(ref == 0); ref++; lru_pin(); } void put() { assert(ref == 1); ref--; lru_unpin(); } - Dentry() : ref(0), dir(0), inode(0) { } + Dentry() : dir(0), inode(0), ref(0) { } + /*Dentry() : name(0), dir(0), inode(0), ref(0) { } + Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { + name = new char[n.length()+1]; + strcpy((char*)name, n.c_str()); + } + ~Dentry() { + delete[] name; + }*/ }; class Dir { public: Inode *parent_inode; // my inode - hash_map< string, Dentry* > dentries; + //hash_map, eqstr> dentries; + hash_map dentries; Dir(Inode* in) { parent_inode = in; } bool is_empty() { return dentries.empty(); } - }; @@ -86,6 +98,13 @@ class Inode { set mds_contacts; time_t last_updated; + int file_caps; + long file_caps_seq; + int file_mds; // semi-hack + time_t file_wr_mtime; // [writers] time of last write + size_t file_wr_size; // [writers] largest offset we've written to + int num_rd, num_wr; // num readers, writers + int ref; // ref count. 1 for each dentry, fh that links to me. Dir *dir; // if i'm a dir. Dentry *dn; // if i'm linked to a dentry. @@ -99,7 +118,9 @@ class Inode { void get() { ref++; } void put() { ref--; assert(ref >= 0); } - Inode() : ref(0), dir(0), dn(0), symlink(0), mds_dir_auth(-1), last_updated(0) { } + Inode() : mds_dir_auth(-1), last_updated(0), + file_caps(0), file_caps_seq(0), num_rd(0), num_wr(0), + ref(0), dir(0), dn(0), symlink(0) { } ~Inode() { if (symlink) { delete symlink; symlink = 0; } } @@ -147,21 +168,11 @@ class Inode { // file handle for any open file state -#define FH_STATE_RDONLY 1 // all readers, cache at will (no write) -#define FH_STATE_WRONLY 2 // all writers, buffer at will (no read) -#define FH_STATE_RDWR 3 // read+write synchronously -#define FH_STATE_LOCK 4 // no read or write struct Fh { - //inodeno_t ino; Inode *inode; int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - int caps; // my capabilities (read, read+cache, write, write+buffer) - - time_t mtime; // [writers] time of last write - size_t size; // [writers] largest offset we've written to }; @@ -190,7 +201,18 @@ class Client : public Dispatcher { LRU lru; // lru list of Dentry's in our local metadata cache. // file handles - hash_map fh_map; + rangeset free_fh_set; // unused fh's + hash_map fh_map; + + fh_t get_fh() { + fh_t fh = free_fh_set.first(); + free_fh_set.erase(fh); + return fh; + } + void put_fh(fh_t fh) { + free_fh_set.insert(fh); + } + // global (client) lock Mutex client_lock; @@ -230,7 +252,7 @@ class Client : public Dispatcher { // link to dir dn->dir = dir; - dir->dentries[name] = dn; + dir->dentries[dn->name] = dn; // link to inode dn->inode = in; @@ -262,10 +284,17 @@ class Client : public Dispatcher { Dentry *relink(Dentry *dn, Dir *dir, string& name) { // first link new dn to dir + /* + char *oldname = (char*)dn->name; + dn->name = new char[name.length()+1]; + strcpy((char*)dn->name, name.c_str()); + dir->dentries[dn->name] = dn; + */ dir->dentries[name] = dn; - + // unlink from old dir dn->dir->dentries.erase(dn->name); + //delete[] oldname; if (dn->dir->is_empty()) close_dir(dn->dir); @@ -351,12 +380,12 @@ class Client : public Dispatcher { // file ops int mknod(const char *path, mode_t mode); int open(const char *path, int mode); - int close(fileh_t fh); - int read(fileh_t fh, char *buf, size_t size, off_t offset); - int write(fileh_t fh, const char *buf, size_t size, off_t offset); + int close(fh_t fh); + int read(fh_t fh, char *buf, size_t size, off_t offset); + int write(fh_t fh, const char *buf, size_t size, off_t offset); int truncate(const char *file, off_t size); - //int truncate(fileh_t fh, off_t size); - int fsync(fileh_t fh, bool syncdataonly); + //int truncate(fh_t fh, off_t size); + int fsync(fh_t fh, bool syncdataonly); }; diff --git a/ceph/client/SyntheticClient.cc b/ceph/client/SyntheticClient.cc index 5abc34ccc3827..8ae4e2e47835b 100644 --- a/ceph/client/SyntheticClient.cc +++ b/ceph/client/SyntheticClient.cc @@ -12,7 +12,7 @@ #include #include -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " " @@ -48,8 +48,8 @@ string SyntheticClient::get_sarg() int SyntheticClient::run() { - run_start = g_clock.gettimepair(); - run_until = timepair_t(0,0); + run_start = g_clock.now(); + run_until = utime_t(0,0); for (list::iterator it = modes.begin(); it != modes.end(); @@ -73,11 +73,11 @@ int SyntheticClient::run() iargs.pop_front(); if (iarg1) { dout(2) << "until " << iarg1 << endl; - timepair_t dur(iarg1,0); + utime_t dur(iarg1,0); run_until = run_start + dur; } else { dout(2) << "until " << iarg1 << " (no limit)" << endl; - run_until = timepair_t(0,0); + run_until = utime_t(0,0); } } break; @@ -148,22 +148,22 @@ int SyntheticClient::run() client->mkdir(prefix.c_str(), 0755); for (int i=0; i 0 && i < iarg1-1 ) { - client_logger->finc("trsum", timepair_to_double(lat)); + client_logger->finc("trsum", (double)lat); client_logger->inc("trnum"); } } @@ -244,6 +244,7 @@ int SyntheticClient::start_thread() pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); assert(thread_id); + return 0; } int SyntheticClient::join_thread() @@ -251,6 +252,7 @@ int SyntheticClient::join_thread() assert(thread_id); void *rv; pthread_join(thread_id, &rv); + return 0; } @@ -286,7 +288,7 @@ void SyntheticClient::init_op_dist() //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_CLOSE, g_conf.fakeclient_op_close ); + op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() op_dist.normalize(); } @@ -303,7 +305,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) dout(4) << "play trace" << endl; t.start(); - timepair_t start = g_clock.gettimepair(); + utime_t start = g_clock.now(); const char *p = prefix.c_str(); @@ -374,8 +376,6 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) } else if (strcmp(op, "open") == 0) { const char *a = t.get_string(p); __int64_t b = t.get_int(); - // HACK - b = O_RDONLY; __int64_t id = t.get_int(); __int64_t fh = client->open(a, b); open_files[id] = fh; @@ -417,8 +417,8 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) dout(1) << "leftover close " << fi->second << endl; if (fi->second > 0) client->close(fi->second); } - + return 0; } @@ -531,7 +531,7 @@ int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is i dout(5) << "writing to " << fn << " fd " << fd << endl; if (fd < 0) return fd; - for (int i=0; iwrite(fd, buf, wrsize, i*wrsize); @@ -539,6 +539,8 @@ int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is i client->close(fd); delete[] buf; + + return 0; } int SyntheticClient::read_file(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes @@ -551,7 +553,7 @@ int SyntheticClient::read_file(string& fn, int size, int rdsize) // size is in dout(5) << "reading from " << fn << " fd " << fd << endl; if (fd < 0) return fd; - for (int i=0; iread(fd, buf, rdsize, i*rdsize); @@ -559,6 +561,8 @@ int SyntheticClient::read_file(string& fn, int size, int rdsize) // size is in client->close(fd); delete[] buf; + + return 0; } @@ -577,14 +581,14 @@ int SyntheticClient::random_walk(int num_req) if (time_to_stop()) break; // ascend? - if (cwd.depth() && !roll_die(pow(.9, cwd.depth()))) { + if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { dout(DBL) << "die says up" << endl; up(); continue; } // descend? - if (.9*roll_die(pow(.9,cwd.depth())) && subdirs.size()) { + if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { string s = get_random_subdir(); cwd.add_dentry( s ); dout(DBL) << "cd " << s << " -> " << cwd << endl; @@ -679,7 +683,7 @@ int SyntheticClient::random_walk(int num_req) } } - if (op == MDS_OP_CLOSE) { + if (op == MDS_OP_RELEASE) { // actually, close if (open_files.empty()) op = MDS_OP_STAT; else { diff --git a/ceph/client/SyntheticClient.h b/ceph/client/SyntheticClient.h index a9c093ec29cb9..ecc5b2de60b5c 100644 --- a/ceph/client/SyntheticClient.h +++ b/ceph/client/SyntheticClient.h @@ -111,13 +111,13 @@ class SyntheticClient { list modes; list sargs; list iargs; - timepair_t run_start; - timepair_t run_until; + utime_t run_start; + utime_t run_until; string get_sarg(); bool time_to_stop() { - if (run_until.first && g_clock.gettimepair() > run_until) + if (run_until.sec() && g_clock.now() > run_until) return true; else return false; diff --git a/ceph/client/Trace.cc b/ceph/client/Trace.cc index 30d2a09af0017..c35e5da86671e 100644 --- a/ceph/client/Trace.cc +++ b/ceph/client/Trace.cc @@ -9,7 +9,7 @@ using namespace __gnu_cxx; #include "common/Mutex.h" -#include "include/config.h" +#include "config.h" #include #include diff --git a/ceph/client/fuse.cc b/ceph/client/fuse.cc index add7bcb2527fa..77b0d66bc081c 100644 --- a/ceph/client/fuse.cc +++ b/ceph/client/fuse.cc @@ -31,7 +31,7 @@ #include "Client.h" -#include "include/config.h" +#include "config.h" // stl #include @@ -41,19 +41,6 @@ using namespace std; // globals Client *client; // the ceph client -// fh fun -Mutex pfh_lock; -rangeset pfh_set; // available pseudo_fh's -map pfh_map; // map pseudo-fh -> fh - -int get_pseudo_fh() { - int fh = pfh_set.first(); - pfh_set.erase(fh); - return fh; -} -void put_pseudo_fh(int fh) { - pfh_set.insert(fh); -} // ------ @@ -158,44 +145,28 @@ static int ceph_open(const char *path, struct fuse_file_info *fi) res = client->open(path, fi->flags); if (res < 0) return res; - - /*pfh_lock.Lock(); - pfh_t pfh = get_pseudo_fh(); - pfh_map[pfh] = res; - fi->fh = pfh; - pfh_lock.Unlock(); - */ fi->fh = res; - return 0; // fuse wants 0 onsucess } static int ceph_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { - /*pfh_lock.Lock(); - fileh_t fh = pfh_map[fi->fh]; - pfh_lock.Unlock(); - */ - fileh_t fh = fi->fh; + fh_t fh = fi->fh; return client->read(fh, buf, size, offset); } static int ceph_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { - /*pfh_lock.Lock(); - fileh_t fh = pfh_map[fi->fh]; - pfh_lock.Unlock(); - */ - fileh_t fh = fi->fh; + fh_t fh = fi->fh; return client->write(fh, buf, size, offset); } /* static int ceph_flush(const char *path, struct fuse_file_info *fi) { - fileh_t fh = fi->fh; + fh_t fh = fi->fh; return client->flush(fh); } */ @@ -209,13 +180,7 @@ static int ceph_statfs(const char *path, struct statfs *stbuf) static int ceph_release(const char *path, struct fuse_file_info *fi) { - /*pfh_lock.Lock(); - fileh_t fh = fi->fh; - pfh_map.erase(fi->fh); - put_pseudo_fh(fi->fh); - pfh_lock.Unlock(); - */ - fileh_t fh = fi->fh; + fh_t fh = fi->fh; int r = client->close(fh); // close the file return r; } @@ -223,7 +188,7 @@ static int ceph_release(const char *path, struct fuse_file_info *fi) static int ceph_fsync(const char *path, int isdatasync, struct fuse_file_info *fi) { - fileh_t fh = fi->fh; + fh_t fh = fi->fh; return client->fsync(fh, isdatasync ? true:false); } @@ -258,9 +223,6 @@ int ceph_fuse_main(Client *c, int argc, char *argv[]) // init client client = c; - // init fh allocator - pfh_set.map_insert(10,65000); - // set up fuse argc/argv int newargc = 0; char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); diff --git a/ceph/common/Clock.cc b/ceph/common/Clock.cc new file mode 100644 index 0000000000000..1d77cce1528ab --- /dev/null +++ b/ceph/common/Clock.cc @@ -0,0 +1,5 @@ +#include "Clock.h" + +// public +Clock g_clock; + diff --git a/ceph/common/Clock.h b/ceph/common/Clock.h index 92924b39d43b9..12e2dac48e40c 100644 --- a/ceph/common/Clock.h +++ b/ceph/common/Clock.h @@ -2,79 +2,127 @@ #ifndef __CLOCK_H #define __CLOCK_H +#include +#include + #include #include #include using namespace std; -// --- time stuff --- -typedef pair timepair_t; // struct timeval is a PITA -// addition, subtraction -inline timepair_t operator+(timepair_t& l, timepair_t& r) { - return timepair_t( l.first + r.first + (l.second+r.second)/1000000L, - (l.second+r.second)%1000000L ); +// -------- +// utime_t + +class utime_t { + private: + struct timeval tv; + + public: + // cons + utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; } + utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; } + + // accessors + time_t sec() const { return tv.tv_sec; } + __suseconds_t usec() const { return tv.tv_usec; } + int nsec() const { return tv.tv_usec*1000; } + + // ref accessors/modifiers + time_t& sec_ref() { return tv.tv_sec; } + __suseconds_t& usec_ref() { return tv.tv_usec; } + + struct timeval& timeval() { return tv; } + + // cast to double + operator double() { + return (double)sec() + ((double)usec() / 1000000.0L); + } +}; + +// arithmetic operators +inline utime_t operator+(const utime_t& l, const utime_t& r) { + return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, + (l.usec()+r.usec())%1000000L ); } -inline timepair_t& operator+=(timepair_t& l, timepair_t& r) { - l.first += r.first + (l.second+r.second)/1000000L; - l.second += r.second; - l.second %= 1000000L; +inline utime_t& operator+=(utime_t& l, const utime_t& r) { + l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; + l.usec_ref() += r.usec(); + l.usec_ref() %= 1000000L; return l; } -inline timepair_t operator-(timepair_t& l, timepair_t& r) { - return timepair_t( l.first - r.first - (l.second= r.second) - l.second -= r.second; +inline utime_t& operator-=(utime_t& l, const utime_t& r) { + l.sec_ref() -= r.sec(); + if (l.usec() >= r.usec()) + l.usec_ref() -= r.usec(); else { - l.second += 1000000L - r.second; - l.first--; + l.usec_ref() += 1000000L - r.usec(); + l.sec_ref()--; } + return l; } -inline double timepair_to_double(const timepair_t &t) { - return (double)t.first + ((double)t.second / 1000000.0L); + +inline bool operator>(const utime_t& a, const utime_t& b) +{ + return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); +} +inline bool operator<(const utime_t& a, const utime_t& b) +{ + return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); } +// ostream +inline ostream& operator<<(ostream& out, const utime_t& t) +{ + //return out << t.sec() << "." << t.usec(); + out << (long)t.sec() << "."; + out.setf(ios::right); + out.fill('0'); + out << setw(6) << t.usec(); + out.unsetf(ios::right); + return out; + + //return out << (long)t.sec << "." << ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf(); +} + + + // -- clock -- class Clock { protected: - struct timeval faketime; // if we're faking. - struct timeval start_offset; // time of process startup. - + //utime_t start_offset; + //utime_t abs_last; + utime_t last; public: - Clock(); - - time_t gettime(struct timeval *ts=0); - timepair_t gettimepair(); - - void sub(struct timeval *a, struct timeval *b) { - a->tv_sec -= b->tv_sec; - - if (a->tv_usec - b->tv_usec >= 0) - a->tv_usec -= b->tv_usec; - else { // borrow from seconds - a->tv_usec = a->tv_usec + 1000000 - b->tv_usec; - a->tv_sec--; - } + Clock() { + // set offset + //start_offset = now(); } - - - void add(struct timeval *a, struct timeval *b) { - a->tv_sec += b->tv_sec; - a->tv_usec += b->tv_usec; - if (a->tv_usec > 1000000) { - a->tv_sec++; - a->tv_usec -= 1000000; - } + + // relative time (from startup) + const utime_t& now() { + gettimeofday(&last.timeval(), NULL); + //last = abs_last - start_offset; + return last; + } + + const utime_t& recent_now() { + return last; + } + + // absolute time + time_t gettime() { + now(); + return last.sec(); } - }; diff --git a/ceph/common/Cond.h b/ceph/common/Cond.h index 73a851381741c..76c3ff01cad47 100644 --- a/ceph/common/Cond.h +++ b/ceph/common/Cond.h @@ -34,14 +34,14 @@ class Cond int Wait(Mutex &mutex, struct timeval *tv) { - Wait(mutex, timepair_t(tv->tv_sec, tv->tv_usec)); + return Wait(mutex, utime_t(tv->tv_sec, tv->tv_usec)); } int Wait(Mutex &mutex, - timepair_t when) { + utime_t when) { // timeval -> timespec struct timespec ts; - ts.tv_sec = when.first; - ts.tv_nsec = when.second*1000; + ts.tv_sec = when.sec(); + ts.tv_nsec = when.nsec(); int r = pthread_cond_timedwait(&C, &mutex.M, &ts); return r; } diff --git a/ceph/common/DecayCounter.h b/ceph/common/DecayCounter.h index fe6f6fce4a7b6..709458a71a5c7 100644 --- a/ceph/common/DecayCounter.h +++ b/ceph/common/DecayCounter.h @@ -5,7 +5,7 @@ #include #include "Clock.h" -#include "include/config.h" +#include "config.h" class DecayCounter { protected: @@ -14,7 +14,7 @@ class DecayCounter { double half_life; // in seconds double k; // k = ln(.5)/half_life - timepair_t last_decay; // time of last decay + utime_t last_decay; // time of last decay public: DecayCounter() : val(0) { @@ -26,8 +26,8 @@ class DecayCounter { reset(); } - void adjust(const timepair_t& now, double a) { - decay(now); + void adjust(double a) { + decay(); val += a; } void adjust_down(const DecayCounter& other) { @@ -46,27 +46,28 @@ class DecayCounter { } void reset() { - last_decay = timepair_t(0,0); + last_decay.sec_ref() = 0; + last_decay.usec_ref() = 0; val = 0; } - void decay(const timepair_t& now) { - timepair_t el = now; + void decay() { + utime_t el = g_clock.recent_now(); el -= last_decay; - if (el.first > 1) { - val = val * exp(timepair_to_double(el) * k); + if (el.sec() >= 1) { + val = val * exp((double)el * k); if (val < .01) val = 0; - last_decay = now; + last_decay = g_clock.recent_now(); } } - double get(const timepair_t& now) { - decay(now); + double get() { + decay(); return val; } - double hit(const timepair_t& now, double v = 1.0) { - decay(now); + double hit(double v = 1.0) { + decay(); val += v; return val; } diff --git a/ceph/common/LogType.h b/ceph/common/LogType.h index 10f601c706037..f5abd2eefcada 100644 --- a/ceph/common/LogType.h +++ b/ceph/common/LogType.h @@ -11,15 +11,6 @@ using namespace __gnu_cxx; #include "Mutex.h" -// for const char* comparisons -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - class LogType { protected: set keyset; diff --git a/ceph/common/Logger.cc b/ceph/common/Logger.cc index be27dd239a1b5..f39aff788fabf 100644 --- a/ceph/common/Logger.cc +++ b/ceph/common/Logger.cc @@ -7,7 +7,7 @@ #include #include "Clock.h" -#include "include/config.h" +#include "config.h" #include #include @@ -27,7 +27,7 @@ Logger::Logger(string fn, LogType *type) filename += fn; //cout << "log " << filename << endl; interval = g_conf.log_interval; - start = g_clock.gettimepair(); // time 0! + start = g_clock.now(); // time 0! last_logged = 0; wrote_header = -1; open = false; @@ -117,11 +117,11 @@ void Logger::flush(bool force) //cout << "opening log file " << filename << endl; } - timepair_t now = g_clock.gettimepair(); - timepair_t fromstart = now - start; + utime_t fromstart = g_clock.recent_now(); + fromstart -= start; while (force || - fromstart.first - last_logged >= interval) { + fromstart.sec() - last_logged >= interval) { last_logged += interval; force = false; diff --git a/ceph/common/Logger.h b/ceph/common/Logger.h index b90b8a453a71f..5bfa8a5add799 100644 --- a/ceph/common/Logger.h +++ b/ceph/common/Logger.h @@ -15,13 +15,6 @@ using namespace __gnu_cxx; #include "LogType.h" -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; class Logger { @@ -32,7 +25,7 @@ class Logger { //Mutex lock; LogType *type; - timepair_t start; + utime_t start; int last_logged; int interval; int wrote_header; diff --git a/ceph/common/Mutex.h b/ceph/common/Mutex.h index 46303d365551a..a7b4faad6bc36 100755 --- a/ceph/common/Mutex.h +++ b/ceph/common/Mutex.h @@ -26,7 +26,7 @@ class Mutex pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - int r = pthread_mutex_init(&M,&attr); + pthread_mutex_init(&M,&attr); //cout << this << " mutex init = " << r << endl; pthread_mutexattr_destroy(&attr); } diff --git a/ceph/common/ThreadPool.h b/ceph/common/ThreadPool.h index 9142341571dbb..17670acfbe141 100644 --- a/ceph/common/ThreadPool.h +++ b/ceph/common/ThreadPool.h @@ -9,7 +9,7 @@ // debug output -#include "include/config.h" +#include "config.h" #define tpdout(x) if (x <= g_conf.debug) cout << myname << " " #define DBLVL 10 @@ -36,6 +36,7 @@ class ThreadPool { { ThreadPool *t = (ThreadPool *)arg; t->do_ops(arg); + return 0; } void * do_ops(void *nothing) @@ -55,6 +56,7 @@ class ThreadPool { //tpdout(DBLVL) << "Thread "<< pthread_self() << " calling the function\n"; func(u, op); } + return 0; } diff --git a/ceph/common/Timer.cc b/ceph/common/Timer.cc index 4e1a90c576c77..324e11a59377d 100644 --- a/ceph/common/Timer.cc +++ b/ceph/common/Timer.cc @@ -3,7 +3,7 @@ #include "Timer.h" #include "Cond.h" -#include "include/config.h" +#include "config.h" #include "include/Context.h" #include "msg/Messenger.h" @@ -23,11 +23,6 @@ Timer g_timer; Context *messenger_kicker = 0; -ostream& operator<<(ostream& out, timepair_t& t) -{ - return out << t.first << "." << t.second; -} - /**** thread solution *****/ @@ -45,19 +40,19 @@ void Timer::timer_thread() while (!thread_stop) { // now - timepair_t now = g_clock.gettimepair(); + utime_t now = g_clock.now(); // any events due? - timepair_t next; + utime_t next; Context *event = get_next_scheduled(next); if (event && now > next) { // move to pending list - map< timepair_t, set >::iterator it = scheduled.begin(); + map< utime_t, set >::iterator it = scheduled.begin(); while (it != scheduled.end()) { if (it->first > now) break; - timepair_t t = it->first; + utime_t t = it->first; dout(DBL) << "queuing event(s) scheduled at " << t << endl; pending[t] = it->second; @@ -150,12 +145,12 @@ void Timer::cancel_timer() void Timer::add_event_after(float seconds, Context *callback) { - timepair_t when = g_clock.gettimepair(); - when.first += (int)seconds; + utime_t when = g_clock.now(); + when.sec_ref() += (int)seconds; add_event_at(when, callback); } -void Timer::add_event_at(timepair_t when, +void Timer::add_event_at(utime_t when, Context *callback) { // insert @@ -183,7 +178,7 @@ bool Timer::cancel_event(Context *callback) return false; // wasn't scheduled. } - timepair_t tp = event_times[callback]; + utime_t tp = event_times[callback]; event_times.erase(callback); scheduled.erase(tp); @@ -204,7 +199,7 @@ void Timer::execute_pending() lock.Lock(); while (pending.size()) { - timepair_t when; + utime_t when; Context *event = take_next_pending(when); lock.Unlock(); diff --git a/ceph/common/Timer.h b/ceph/common/Timer.h index 2e3bc5a7273c9..1c8f535980e09 100644 --- a/ceph/common/Timer.h +++ b/ceph/common/Timer.h @@ -22,24 +22,24 @@ class Messenger; class Timer { private: - map< timepair_t, set > scheduled; // time -> (context ...) - map< timepair_t, set > pending; // time -> (context ...) - map< Context*, timepair_t > event_times; // event -> time + map< utime_t, set > scheduled; // time -> (context ...) + map< utime_t, set > pending; // time -> (context ...) + map< Context*, utime_t > event_times; // event -> time // get time of the next event - Context* get_next_scheduled(timepair_t& when) { + Context* get_next_scheduled(utime_t& when) { if (scheduled.empty()) return 0; - map< timepair_t, set >::iterator it = scheduled.begin(); + map< utime_t, set >::iterator it = scheduled.begin(); when = it->first; set::iterator sit = it->second.begin(); return *sit; } // get next pending event - Context* take_next_pending(timepair_t& when) { + Context* take_next_pending(utime_t& when) { if (pending.empty()) return 0; - map< timepair_t, set >::iterator it = pending.begin(); + map< utime_t, set >::iterator it = pending.begin(); when = it->first; // take and remove @@ -68,7 +68,7 @@ class Timer { } ~Timer() { // scheduled - for (map< timepair_t, set >::iterator it = scheduled.begin(); + for (map< utime_t, set >::iterator it = scheduled.begin(); it != scheduled.end(); it++) { for (set::iterator sit = it->second.begin(); @@ -79,7 +79,7 @@ class Timer { scheduled.clear(); // pending - for (map< timepair_t, set >::iterator it = pending.begin(); + for (map< utime_t, set >::iterator it = pending.begin(); it != pending.end(); it++) { for (set::iterator sit = it->second.begin(); @@ -103,7 +103,7 @@ class Timer { // schedule events void add_event_after(float seconds, Context *callback); - void add_event_at(timepair_t when, + void add_event_at(utime_t when, Context *callback); bool cancel_event(Context *callback); diff --git a/ceph/common/clock.cc b/ceph/common/clock.cc deleted file mode 100644 index a2272ec949fc4..0000000000000 --- a/ceph/common/clock.cc +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include "Clock.h" - -#include "include/config.h" - -#ifndef NULL -#define NULL 0 -#endif - -// public -Clock g_clock; - -// class definition - -// cons -Clock::Clock() { - // set offset to now - start_offset.tv_sec = 0; - start_offset.tv_usec = 0; - //gettime(&start_offset); -} - - -time_t Clock::gettime(struct timeval *ts) -{ - if (g_conf.fake_clock) { - faketime.tv_usec += 100; - while (faketime.tv_usec > 1000000) { - faketime.tv_sec++; - faketime.tv_usec -= 1000000; - } - if (ts) - *ts = faketime; - return faketime.tv_sec; - } else { - // get actual time - struct timeval curtime; - gettimeofday(&curtime,NULL); - - sub(&curtime, &start_offset); - - if (ts) - *ts = curtime; - return curtime.tv_sec; - } -} - - -struct timeval now; - -timepair_t Clock::gettimepair() -{ - gettime(&now); - return timepair_t(now.tv_sec, now.tv_usec); -} diff --git a/ceph/config.cc b/ceph/config.cc index 2381e4593d969..2db42d4be75ec 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -1,5 +1,5 @@ -#include "include/config.h" +#include "config.h" #include "osd/OSDCluster.h" @@ -10,7 +10,8 @@ #define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) //#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -#define MDS_CACHE_SIZE 1500000 +//#define MDS_CACHE_SIZE 1500000 +#define MDS_CACHE_SIZE 150000 // hack hack hack ugly FIXME diff --git a/ceph/fakefuse.cc b/ceph/fakefuse.cc index 556bf16952ee0..39ec93f5208d0 100644 --- a/ceph/fakefuse.cc +++ b/ceph/fakefuse.cc @@ -5,7 +5,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #include "mds/MDCluster.h" #include "mds/MDS.h" @@ -109,7 +109,7 @@ int main(int oargc, char **oargv) { // wait for it to finish cout << "DONE -----" << endl; - fakemessenger_stopthread(); // blocks until messenger stops + fakemessenger_wait(); // blocks until messenger stops // cleanup diff --git a/ceph/fakesyn.cc b/ceph/fakesyn.cc index 95cd407bb6877..9cf44ce79812a 100644 --- a/ceph/fakesyn.cc +++ b/ceph/fakesyn.cc @@ -5,7 +5,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #include "mds/MDCluster.h" #include "mds/MDS.h" @@ -117,7 +117,7 @@ int main(int oargc, char **oargv) char hostname[100]; gethostname(hostname,100); - int pid = getpid(); + //int pid = getpid(); // create mds MDS *mds[NUMMDS]; diff --git a/ceph/include/Distribution.h b/ceph/include/Distribution.h index 27eb740f86909..e8f6e4dcd3a34 100644 --- a/ceph/include/Distribution.h +++ b/ceph/include/Distribution.h @@ -13,7 +13,7 @@ class Distribution { //Distribution() { //} - int get_width() { + unsigned get_width() { return p.size(); } @@ -28,17 +28,17 @@ class Distribution { void random() { float sum = 0.0; - for (int i=0; i 0); return --_ref; } @@ -72,10 +74,10 @@ class buffer { public: // constructors - buffer() : _dataptr(0), _len(0), _alloc_len(0), _ref(0), _myptr(true) { + buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0) { bdbout(1) << "buffer.cons " << *this << endl; } - buffer(int a) : _dataptr(0), _len(a), _alloc_len(a), _ref(0), _myptr(true) { + buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0) { bdbout(1) << "buffer.cons " << *this << endl; _dataptr = new char[a]; bufferlock.Lock(); @@ -94,9 +96,9 @@ class buffer { buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0) : _dataptr(0), - _len(l), - _ref(0), - _myptr(0) { + _myptr(false), + _len(l), + _ref(0) { if (alloc_len) _alloc_len = alloc_len; @@ -129,15 +131,15 @@ class buffer { } // accessor - int alloc_length() { + unsigned alloc_length() { return _alloc_len; } - int set_length(int l) { + void set_length(unsigned l) { assert(l <= _alloc_len); _len = l; } - int length() { return _len; } - int unused_tail_length() { return _alloc_len - _len; } + unsigned length() { return _len; } + unsigned unused_tail_length() { return _alloc_len - _len; } friend ostream& operator<<(ostream& out, buffer& b); }; @@ -161,7 +163,7 @@ inline ostream& operator<<(ostream& out, buffer& b) { class bufferptr { private: buffer *_buffer; - int _len, _off; + unsigned _len, _off; public: // empty cons @@ -177,7 +179,7 @@ class bufferptr { _buffer->_get(); } // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, int len, int off) : + bufferptr(const bufferptr& bp, unsigned len, unsigned off) : _buffer(bp._buffer), _len(len) { _off = bp._off + off; @@ -238,13 +240,13 @@ class bufferptr { char *c_str() { return _buffer->_dataptr + _off; } - int length() { + unsigned length() { return _len; } - int offset() { + unsigned offset() { return _off; } - int unused_tail_length() { + unsigned unused_tail_length() { if (!at_buffer_tail()) return 0; return _buffer->unused_tail_length(); } @@ -252,11 +254,11 @@ class bufferptr { // modifiers - void set_offset(int off) { + void set_offset(unsigned off) { assert(off <= _buffer->_alloc_len); _off = off; } - void set_length(int len) { + void set_length(unsigned len) { assert(len >= 0 && _off + len <= _buffer->_alloc_len); if (_buffer->_len < _off + len) _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) @@ -265,7 +267,7 @@ class bufferptr { // crope lookalikes - void append(const char *p, int len) { + void append(const char *p, unsigned len) { assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? // copy @@ -273,12 +275,12 @@ class bufferptr { _buffer->_len += len; _len += len; } - void copy_out(int off, int len, char *dest) { + void copy_out(unsigned off, unsigned len, char *dest) { assert(off >= 0 && off <= _len); assert(len >= 0 && off + len <= _len); memcpy(dest, c_str() + off, len); } - void copy_in(int off, int len, const char *src) { + void copy_in(unsigned off, unsigned len, const char *src) { assert(off >= 0 && off <= _len); assert(len >= 0 && off + len <= _len); memcpy(c_str() + off, src, len); diff --git a/ceph/include/bufferlist.h b/ceph/include/bufferlist.h index e5ddf417ff743..5d06299b7d70c 100644 --- a/ceph/include/bufferlist.h +++ b/ceph/include/bufferlist.h @@ -13,7 +13,7 @@ using namespace __gnu_cxx; // debug crap -#include "include/config.h" +#include "config.h" #define bdbout(x) if (x <= g_conf.debug_buffer) cout @@ -24,7 +24,7 @@ class bufferlist { * we maintain _len ourselves, so we must be careful when fiddling with buffers! */ list _buffers; - int _len; + unsigned _len; public: // cons/des @@ -56,7 +56,7 @@ class bufferlist { //list::iterator begin() { return _buffers.begin(); } //list::iterator end() { return _buffers.end(); } - int length() { + unsigned length() { #if 0 { // DEBUG: verify _len int len = 0; @@ -121,7 +121,7 @@ class bufferlist { // crope lookalikes - void copy(int off, int len, char *dest) { + void copy(unsigned off, unsigned len, char *dest) { assert(off >= 0); assert(off + len <= length()); @@ -150,7 +150,7 @@ class bufferlist { } // get as much as we can from this buffer. - int howmuch = (*curbuf).length() - off; + unsigned howmuch = (*curbuf).length() - off; (*curbuf).copy_out(off, howmuch, dest); dest += howmuch; @@ -161,7 +161,7 @@ class bufferlist { } } - void copy_in(int off, int len, const char *src) { + void copy_in(unsigned off, unsigned len, const char *src) { assert(off >= 0); assert(off + len <= length()); @@ -190,7 +190,7 @@ class bufferlist { } // get as much as we can from this buffer. - int howmuch = (*curbuf).length() - off; + unsigned howmuch = (*curbuf).length() - off; (*curbuf).copy_in(off, howmuch, src); src += howmuch; @@ -202,19 +202,19 @@ class bufferlist { } - void append(const char *data, int len) { + void append(const char *data, unsigned len) { if (len == 0) return; - int alen = 0; + unsigned alen = 0; // copy into the tail buffer? if (!_buffers.empty()) { - int avail = _buffers.back().unused_tail_length(); + unsigned avail = _buffers.back().unused_tail_length(); if (avail > 0) { //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; if (avail > len) avail = len; - int blen = _buffers.back().length(); + unsigned blen = _buffers.back().length(); memcpy(_buffers.back().c_str() + blen, data, avail); blen += avail; _buffers.back().set_length(blen); @@ -234,7 +234,7 @@ class bufferlist { void append(bufferptr& bp) { push_back(bp); } - void append(bufferptr& bp, int len, int off) { + void append(bufferptr& bp, unsigned len, unsigned off) { bufferptr tempbp(bp, len, off); push_back(tempbp); } @@ -253,7 +253,7 @@ class bufferlist { else { // make one new contiguous buffer. bufferptr newbuf = new buffer(length()); - int off = 0; + unsigned off = 0; for (list::iterator it = _buffers.begin(); it != _buffers.end(); @@ -273,7 +273,7 @@ class bufferlist { } - void substr_of(bufferlist& other, int off, int len) { + void substr_of(bufferlist& other, unsigned off, unsigned len) { assert(off + len <= other.length()); clear(); @@ -304,7 +304,7 @@ class bufferlist { // through end //cout << "copying end (all?) of " << *curbuf << endl; - int howmuch = (*curbuf).length() - off; + unsigned howmuch = (*curbuf).length() - off; _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); _len += howmuch; len -= howmuch; @@ -314,7 +314,7 @@ class bufferlist { } // funky modifer - void splice(int off, int len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? + void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? assert(off < length()); assert(len > 0); //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl; @@ -358,7 +358,7 @@ class bufferlist { } // hose though the end - int howmuch = (*curbuf).length() - off; + unsigned howmuch = (*curbuf).length() - off; //cout << "discarding " << howmuch << " of " << *curbuf << endl; if (claim_by) claim_by->append( *curbuf, howmuch, off ); @@ -427,7 +427,7 @@ inline void _decode(set& s, bufferlist& bl, int& off) off += sizeof(v); s.insert(v); } - assert(s.size() == n); + assert(s.size() == (unsigned)n); } // vector @@ -457,7 +457,7 @@ inline void _decode(vector& s, bufferlist& bl, int& off) off += sizeof(v); s[i] = v; } - assert(s.size() == n); + assert(s.size() == (unsigned)n); } diff --git a/ceph/include/filepath.h b/ceph/include/filepath.h index b3893bd94fd7b..2cee26493d8fb 100644 --- a/ceph/include/filepath.h +++ b/ceph/include/filepath.h @@ -27,7 +27,7 @@ class filepath { void parse() { bits.clear(); int off = 0; - while (off < path.length()) { + while (off < (int)path.length()) { // skip trailing/duplicate slash(es) int nextslash = path.find('/', off); if (nextslash == off) { @@ -80,7 +80,7 @@ class filepath { } filepath postfixpath(int s) const { filepath t; - for (int i=s; i using namespace std; -#include "include/config.h" +#include "config.h" @@ -185,8 +185,8 @@ class LRU { void lru_adjust() { if (!lru_max) return; - int toplen = lru_top.get_length(); - __uint32_t topwant = (__uint32_t)(lru_midpoint * (double)lru_max); + unsigned toplen = lru_top.get_length(); + unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max); while (toplen > 0 && toplen > topwant) { // remove from tail of top, stick at head of bot diff --git a/ceph/include/types.h b/ceph/include/types.h index dd9d7164b2ac5..f667fda624be9 100644 --- a/ceph/include/types.h +++ b/ceph/include/types.h @@ -38,7 +38,8 @@ using namespace __gnu_cxx; #define MDS_OP_OPEN 301 #define MDS_OP_TRUNCATE 306 #define MDS_OP_FSYNC 307 -#define MDS_OP_CLOSE 310 +//#define MDS_OP_CLOSE 310 +#define MDS_OP_RELEASE 308 @@ -65,6 +66,31 @@ namespace __gnu_cxx { } +/* + * comparators for stl containers + */ +// for hash_map: +// hash_map, eqstr> vals; +struct eqstr +{ + bool operator()(const char* s1, const char* s2) const + { + return strcmp(s1, s2) == 0; + } +}; + +// for set, map +struct ltstr +{ + bool operator()(const char* s1, const char* s2) const + { + return strcmp(s1, s2) < 0; + } +}; + + + + // -- inode -- typedef __uint64_t inodeno_t; // ino @@ -74,6 +100,10 @@ typedef __uint64_t inodeno_t; // ino #define INODE_MODE_DIR 0040000 // S_IFDIR #define INODE_TYPE_MASK 0170000 +#define FILE_MODE_R 1 +#define FILE_MODE_W 2 +#define FILE_MODE_RW 3 + struct inode_t { // immutable inodeno_t ino; // NOTE: ino _must_ come first for MDStore.cc to behave!! @@ -96,10 +126,14 @@ struct inode_t { }; -// misc other types +// osd types typedef int repgroup_t; // replica group typedef __uint64_t object_t; // object id -typedef __uint32_t fileh_t; // file handle +typedef __uint64_t coll_t; // collection id + +// client types +typedef int fh_t; // file handle + // dentries #define MAX_DENTRY_LEN 255 @@ -206,7 +240,7 @@ inline void _unrope(set& s, crope& r, int& off) off += sizeof(v); s.insert(v); } - assert(s.size() == n); + assert(s.size() == (unsigned)n); } #endif diff --git a/ceph/mds/AnchorTable.cc b/ceph/mds/AnchorTable.cc index a5bd7eec8f2e0..bfb4de7109df0 100644 --- a/ceph/mds/AnchorTable.cc +++ b/ceph/mds/AnchorTable.cc @@ -8,7 +8,7 @@ #include "messages/MAnchorRequest.h" #include "messages/MAnchorReply.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(x) if (x <= g_conf.debug) cout << "anchortable: " @@ -117,7 +117,7 @@ void AnchorTable::create(inodeno_t ino, vector& trace) dout(7) << "create " << ino << endl; // make sure trace is in table - for (int i=0; iino, trace[i]->dirino, trace[i]->ref_dn); inc(ino); // ok! diff --git a/ceph/mds/CDentry.h b/ceph/mds/CDentry.h index e11abb204d1ac..1e401341c9b90 100644 --- a/ceph/mds/CDentry.h +++ b/ceph/mds/CDentry.h @@ -53,31 +53,31 @@ class CDentry { inode(0), dir(0), remote_ino(0), + dirty(0), + parent_dir_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), - npins(0), - dirty(0), - parent_dir_version(0) { } + npins(0) { } CDentry(const string& n, inodeno_t ino, CInode *in=0) : name(n), + inode(in), dir(0), remote_ino(ino), - inode(in), + dirty(0), + parent_dir_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), - npins(0), - dirty(0), - parent_dir_version(0) { } + npins(0) { } CDentry(const string& n, CInode *in) : name(n), - dir(0), inode(in), + dir(0), remote_ino(0), + dirty(0), + parent_dir_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), - npins(0), - dirty(0), - parent_dir_version(0) { } + npins(0) { } CInode *get_inode() { return inode; } CDir *get_dir() { return dir; } @@ -145,14 +145,14 @@ class CDentry { void pin(Message *m) { npins++; pinset.insert(m); - assert(pinset.size() == npins); + assert(pinset.size() == (unsigned)npins); } void unpin(Message *m) { npins--; assert(npins >= 0); assert(pinset.count(m) > 0); pinset.erase(pinset.find(m)); - assert(pinset.size() == npins); + assert(pinset.size() == (unsigned)npins); } bool is_pinnable(Message *m) { return (lockstate == DN_LOCK_SYNC) || diff --git a/ceph/mds/CDir.cc b/ceph/mds/CDir.cc index c8c8015380269..5014ff5cc28e8 100644 --- a/ceph/mds/CDir.cc +++ b/ceph/mds/CDir.cc @@ -10,12 +10,27 @@ #include -#include "include/config.h" +#include "config.h" #undef dout #define dout(x) if (x <= g_conf.debug) cout << "mds" << mds->get_nodeid() << " cdir: " map cdir_pins; +static char* cdir_pin_names[CDIR_NUM_PINS] = { + "child", + "opened", + "hashed", + "waiter", + "import", + "export", + "freeze", + "proxy", + "authpin", + "imping", + "impgex", + "reqpins", + "dirty" +}; ostream& operator<<(ostream& out, CDir& dir) { diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index 6d9becde09dc7..16012d3482678 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -4,7 +4,7 @@ #include "include/types.h" #include "include/bufferlist.h" -#include "include/config.h" +#include "config.h" #include "common/DecayCounter.h" #include @@ -113,21 +113,6 @@ class Context; #define CDIR_PIN_REQUEST 12 #define CDIR_NUM_PINS 13 -static char* cdir_pin_names[CDIR_NUM_PINS] = { - "child", - "opened", - "hashed", - "waiter", - "import", - "export", - "freeze", - "proxy", - "authpin", - "imping", - "impgex", - "reqpins", - "dirty" -}; @@ -351,8 +336,8 @@ class CDir { // for giving to clients - void get_dist_spec(set& ls, int auth, timepair_t& now) { - if (( popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold)) { + void get_dist_spec(set& ls, int auth) { + if (( popularity[MDS_POP_CURDOM].get() > g_conf.mds_bal_replicate_threshold)) { //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; ls = open_by; } @@ -635,7 +620,7 @@ class CDirExport { inodeno_t get_ino() { return st.ino; } __uint64_t get_nden() { return st.nden; } - void update_dir(CDir *dir, timepair_t& now) { + void update_dir(CDir *dir) { assert(dir->ino() == st.ino); //dir->nitems = st.nitems; @@ -645,10 +630,10 @@ class CDirExport { dir->dir_auth = st.dir_auth; dir->dir_rep = st.dir_rep; - double newcurdom = st.popularity_curdom.get(now) - dir->popularity[MDS_POP_CURDOM].get(now); + double newcurdom = st.popularity_curdom.get() - dir->popularity[MDS_POP_CURDOM].get(); dir->popularity[MDS_POP_JUSTME].take( st.popularity_justme ); dir->popularity[MDS_POP_CURDOM].take( st.popularity_curdom ); - dir->popularity[MDS_POP_ANYDOM].adjust(now, newcurdom); + dir->popularity[MDS_POP_ANYDOM].adjust(newcurdom); dir->dir_rep_by = rep_by; dir->open_by = open_by; diff --git a/ceph/mds/CFile.h b/ceph/mds/CFile.h deleted file mode 100644 index c78d43f18f625..0000000000000 --- a/ceph/mds/CFile.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CFILE_H -#define __CFILE_H - -// client open modes -#define CFILE_MODE_R 1 -#define CFILE_MODE_RW 2 -#define CFILE_MODE_W 3 - -// client capabilities (what client is allowed to be doing) -#define CFILE_CAP_RDCACHE 1 -#define CFILE_CAP_RD 2 -#define CFILE_CAP_WR 4 -#define CFILE_CAP_WRBUFFER 8 - - -class CFile { - public: - int client; - fileh_t fh; // file handle - int mode; // mode the file was opened in - int pending_caps; - long last_sent; - int confirmed_caps; // known to be client capabilities - long last_recv; - - CFile() : - client(0), - fh(0), - mode(0), - pending_caps(0), - last_sent(0), - confirmed_caps(0), - last_recv(0) { } -}; - - -// capability helper functions! -inline int file_mode_want_caps(int mode) { - int wants = 0; - if (mode == CFILE_MODE_W ) { - wants |= CFILE_CAP_WR | CFILE_CAP_WRBUFFER; - } - else if (mode == CFILE_MODE_RW) { - wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD | CFILE_CAP_WR | CFILE_CAP_WRBUFFER; - } - else if (mode == CFILE_MODE_R ) { - wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD; - } - else assert(0); - return wants; -} - -inline int file_mode_conflict_caps(int mode) { - int conflicts = 0; - if (mode == CFILE_MODE_W ) { - conflicts |= CFILE_CAP_RDCACHE; - } - else if (mode == CFILE_MODE_RW) { - conflicts |= CFILE_CAP_RDCACHE | CFILE_CAP_WRBUFFER; - } - else if (mode == CFILE_MODE_R ) { - conflicts |= CFILE_CAP_WRBUFFER; - } - else assert(0); - return conflicts; -} - - -#endif diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc index 58820fa8608aa..268da281af645 100644 --- a/ceph/mds/CInode.cc +++ b/ceph/mds/CInode.cc @@ -8,7 +8,7 @@ #include -#include "include/config.h" +#include "config.h" #undef dout #define dout(x) if (x <= g_conf.debug) cout << "cinode: " @@ -76,8 +76,6 @@ CInode::CInode(bool auth) : LRUObject(), state = 0; version = 0; - nrdonly = nwronly = nrdwr = 0; - if (auth) state_set(CINODE_STATE_AUTH); } @@ -129,8 +127,8 @@ void CInode::set_auth(bool a) { if (!is_dangling() && !is_root() && is_auth() != a) { - CDir *dir = get_parent_dir(); /* + CDir *dir = get_parent_dir(); if (is_auth() && !a) dir->nauthitems--; else diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index b2fe4d3f0657d..261a1e5ae0044 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -2,15 +2,15 @@ #ifndef __CINODE_H #define __CINODE_H -#include "include/config.h" +#include "config.h" #include "include/types.h" #include "include/lru.h" #include "common/DecayCounter.h" #include #include "CDentry.h" -#include "CFile.h" #include "Lock.h" +#include "Capability.h" #include #include @@ -203,11 +203,8 @@ class CInode : LRUObject { multimap waiting; // issued client capabilities - map caps; + map caps; // client -> caps - // open file state (me) - map fh_map; // locally opened files - int nrdonly, nrdwr, nwronly; // file mode counts private: // lock nesting @@ -260,12 +257,6 @@ class CInode : LRUObject { } bool dir_is_auth(); - /* - float get_popularity() { - timepair_t now = g_clock.gettimepair(); - return popularity[0].get(now); - } - */ // -- misc -- @@ -410,38 +401,31 @@ class CInode : LRUObject { assert(caps.count(client) == 1); caps.erase(client); } - - // -- open files -- (old) - bool is_open_write() { return nwronly; } - bool is_open_read() { return nrdonly; } - bool is_open() { return is_open_write() || is_open_read(); } - - int get_num_fh() { return fh_map.size(); } - CFile* get_fh(int fh) { - if (fh_map.count(fh)) return fh_map[fh]; + Capability* get_cap(int client) { + if (caps.count(client)) + return &caps[client]; return 0; } - void add_fh(CFile *f) { - if (f->mode == CFILE_MODE_R) nrdonly++; - if (f->mode == CFILE_MODE_W) { - nwronly++; - softlock.get_write(); - } - if (f->mode == CFILE_MODE_RW) nrdwr++; - - if (fh_map.empty()) get(CINODE_PIN_OPEN); - fh_map[f->fh] = f; - } - void remove_fh(CFile *f) { - if (f->mode == CFILE_MODE_R) nrdonly--; - if (f->mode == CFILE_MODE_W) { - nwronly--; - softlock.put_write(); - } - if (f->mode == CFILE_MODE_RW) nrdwr--; - fh_map.erase(f->fh); - if (fh_map.empty()) put(CINODE_PIN_OPEN); + void add_caps(map& cl) { + caps.clear(); + caps = cl; + } + void remove_caps(map& cl) { + cl = caps; + caps.clear(); + } + + int get_caps_issued() { + int c = 0; + for (map::iterator it = caps.begin(); + it != caps.end(); + it++) + c |= it->second.issued(); + return c; + } + bool is_write_caps() { + return get_caps_issued() & CAP_FILE_WR; } @@ -575,7 +559,7 @@ class CInodeDiscover { inodeno_t get_ino() { return inode.ino; } int get_replica_nonce() { return replica_nonce; } - int update_inode(CInode *in) { + void update_inode(CInode *in) { in->inode = inode; in->replica_nonce = replica_nonce; @@ -615,7 +599,7 @@ typedef struct { bool is_dirty; // dirty inode? int ncached_by; // int pairs follow - int num_fh; + int num_caps; } CInodeExport_st; @@ -624,7 +608,7 @@ class CInodeExport { CInodeExport_st st; set cached_by; map cached_by_nonce; - list fh_list; + map cap_map; CLock hardlock,softlock; @@ -643,34 +627,23 @@ public: st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); in->popularity[MDS_POP_ANYDOM].adjust_down(st.popularity_curdom); - // suck up fh's from inode - for (map::iterator it = in->fh_map.begin(); - it != in->fh_map.end(); - it++) { - fh_list.push_back(it->second); - } - for (list::iterator it = fh_list.begin(); - it != fh_list.end(); - it++) - in->remove_fh(*it); + // steal caps from inode + in->remove_caps(cap_map); } ~CInodeExport() { - for (list::iterator it = fh_list.begin(); - it != fh_list.end(); - it++) delete *it; } inodeno_t get_ino() { return st.inode.ino; } - int update_inode(CInode *in, timepair_t& now) { + void update_inode(CInode *in) { in->inode = st.inode; in->version = st.version; - double newcurdom = st.popularity_curdom.get(now) - in->popularity[MDS_POP_CURDOM].get(now); + double newcurdom = st.popularity_curdom.get() - in->popularity[MDS_POP_CURDOM].get(); in->popularity[MDS_POP_JUSTME].take( st.popularity_justme ); in->popularity[MDS_POP_CURDOM].take( st.popularity_curdom ); - in->popularity[MDS_POP_ANYDOM].adjust(now, newcurdom); + in->popularity[MDS_POP_ANYDOM].adjust(newcurdom); if (st.is_dirty) { in->mark_dirty(); @@ -685,19 +658,13 @@ public: in->hardlock = hardlock; in->softlock = softlock; - // fh's - for (list::iterator it = fh_list.begin(); - it != fh_list.end(); - it++) { - in->add_fh(*it); - } - fh_list.clear(); - + // caps + in->add_caps(cap_map); } void _encode(bufferlist& bl) { st.ncached_by = cached_by.size(); - st.num_fh = fh_list.size(); + st.num_caps = cap_map.size(); bl.append((char*)&st, sizeof(st)); // cached_by + nonce @@ -713,13 +680,12 @@ public: hardlock.encode_state(bl); softlock.encode_state(bl); - // fh - for (list::iterator it = fh_list.begin(); - it != fh_list.end(); + // caps + for (map::iterator it = cap_map.begin(); + it != cap_map.end(); it++) { - bl.append((char*)*it, sizeof(CFile)); - CFile *f = *it; - //cout << "f in client " << f->client << " fh " << f->fh << endl; + bl.append((char*)&it->first, sizeof(it->first)); + it->second._encode(bl); } } @@ -740,13 +706,12 @@ public: hardlock.decode_state(bl, off); softlock.decode_state(bl, off); - // fh - for (int i=0; iclient << " fh " << f->fh << endl; - off += sizeof(CFile); - fh_list.push_back(f); + // caps + for (int i=0; i +using namespace std; + +#include "config.h" + + // definite caps #define CAP_FILE_RDCACHE 1 #define CAP_FILE_RD 2 #define CAP_FILE_WR 4 #define CAP_FILE_WRBUFFER 8 -#define CAP_INODE_STAT 16 +//#define CAP_INODE_STAT 16 // heuristics -#define CAP_FILE_DELAYFLUSH 32 +//#define CAP_FILE_DELAYFLUSH 32 + +inline string cap_string(int cap) +{ + string s; + s = "["; + if (cap & CAP_FILE_RDCACHE) s += " rdcache"; + if (cap & CAP_FILE_RD) s += " rd"; + if (cap & CAP_FILE_WR) s += " wr"; + if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; + s += " ]"; + return s; +} + class Capability { int wanted_caps; // what the client wants - int pending_caps; // what we've sent them - int confirmed_caps; // what they've confirmed they've received - Capability(int wants = 0) : - wanted_caps(wants), - pending_caps(0), - confirmed_caps(0) { } + map cap_history; // seq -> cap + long last_sent, last_recv; + +public: + Capability(int want=0) : + wanted_caps(want), + last_sent(0), + last_recv(0) { + cap_history[last_sent] = 0; + } + + + // most recently issued caps. + int pending() { + return cap_history[ last_sent ]; + } + + // caps client has confirmed receipt of + int confirmed() { + return cap_history[ last_recv ]; + } + + // caps potentially issued + int issued() { + int c = 0; + for (long seq = last_recv; seq <= last_sent; seq++) { + c |= cap_history[seq]; + dout(10) << "cap issued: " << seq << " " << cap_history[seq] << " -> " << c << endl; + } + return c; + } + + // caps this client wants to hold + int wanted() { return wanted_caps; } + void set_wanted(int w) { + wanted_caps = w; + } + + // conflicts + int conflicts(int from) { + int c = 0; + if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; + if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; + return c; + } + int wanted_conflicts() { + return conflicts(wanted_caps); + } + int issued_conflicts() { + return conflicts(issued()); + } + + // issue caps; return seq number. + long issue(int c) { + ++last_sent; + cap_history[last_sent] = c; + return last_sent; + } + long get_last_seq() { return last_sent; } + + void confirm_receipt(long seq) { + while (last_recv < seq) { + cap_history.erase(last_recv); + ++last_recv; + } + } + + // serializers + void _encode(bufferlist& bl) { + bl.append((char*)&wanted_caps, sizeof(wanted_caps)); + bl.append((char*)&last_sent, sizeof(last_sent)); + bl.append((char*)&last_recv, sizeof(last_recv)); + for (long seq = last_recv; seq <= last_sent; seq++) { + int c = cap_history[seq]; + bl.append((char*)&c, sizeof(c)); + } + } + void _decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); + off += sizeof(wanted_caps); + bl.copy(off, sizeof(last_sent), (char*)&last_sent); + off += sizeof(last_sent); + bl.copy(off, sizeof(last_recv), (char*)&last_recv); + off += sizeof(last_recv); + for (long seq=last_recv; seq<=last_sent; seq++) { + int c; + bl.copy(off, sizeof(c), (char*)&c); + off += sizeof(c); + cap_history[seq] = c; + } + } + }; +/* +// capability helper functions! +inline int file_mode_want_caps(int mode) { + int wants = 0; + if (mode == CFILE_MODE_W ) { + wants |= CFILE_CAP_WR | CFILE_CAP_WRBUFFER; + } + else if (mode == CFILE_MODE_RW) { + wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD | CFILE_CAP_WR | CFILE_CAP_WRBUFFER; + } + else if (mode == CFILE_MODE_R ) { + wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD; + } + else assert(0); + return wants; +} +*/ + + #endif diff --git a/ceph/mds/IdAllocator.cc b/ceph/mds/IdAllocator.cc index 0e2432bb5d3b4..c5355dd73c051 100644 --- a/ceph/mds/IdAllocator.cc +++ b/ceph/mds/IdAllocator.cc @@ -11,7 +11,7 @@ #include "include/types.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(x) if (x <= g_conf.debug) cout << "mds" << mds->get_nodeid() << ".idalloc: " @@ -103,12 +103,12 @@ void IdAllocator::reset() free.clear(); // use generic range FIXME THIS IS CRAP - free[ID_INO].map_insert((long long)100000000LL * (mds->get_nodeid()+1), - (long long)100000000LL * (mds->get_nodeid()+2) - 1); + free[ID_INO].map_insert((long long)100000000LL * (long long)(mds->get_nodeid()+1), + (long long)100000000LL * (long long)(mds->get_nodeid()+2) - 1LL); //free[ID_INO].dump(); - free[ID_FH].map_insert(10000000LL * (mds->get_nodeid()+1), - 10000000LL * (mds->get_nodeid()+2) - 1); + //free[ID_FH].map_insert(10000000LL * (mds->get_nodeid()+1), + //10000000LL * (mds->get_nodeid()+2) - 1); //free[ID_FH].dump(); opened = true; diff --git a/ceph/mds/IdAllocator.h b/ceph/mds/IdAllocator.h index 15dadd56953c0..d89a36d4dc188 100644 --- a/ceph/mds/IdAllocator.h +++ b/ceph/mds/IdAllocator.h @@ -9,7 +9,7 @@ class MDS; #define ID_INO 1 // inode -#define ID_FH 2 // file handle +//#define ID_FH 2 // file handle typedef __uint64_t idno_t; diff --git a/ceph/mds/Lock.h b/ceph/mds/Lock.h index f3415b87e7314..70f48125e3ecf 100644 --- a/ceph/mds/Lock.h +++ b/ceph/mds/Lock.h @@ -96,11 +96,12 @@ class CLock { char set_state(char s) { state = s; assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. + return s; }; char get_mode() { return mode; } char set_mode(char m) { - mode = m; + return mode = m; } char get_replica_state() { @@ -218,7 +219,7 @@ inline ostream& operator<<(ostream& out, CLock& l) "wgsync" }; - out << "(" << __lock_states[l.get_state()]; + out << "(" << __lock_states[(int)l.get_state()]; if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); diff --git a/ceph/mds/LogEvent.h b/ceph/mds/LogEvent.h index 432700250716a..6a0311e80282e 100644 --- a/ceph/mds/LogEvent.h +++ b/ceph/mds/LogEvent.h @@ -12,7 +12,7 @@ using namespace std; #define EVENT_UNLINK 3 #define EVENT_ALLOC 4 -#include "include/config.h" +#include "config.h" // generic log event @@ -22,6 +22,7 @@ class LogEvent { public: LogEvent(int t) : _type(t) { } + virtual ~LogEvent() { } int get_type() { return _type; } diff --git a/ceph/mds/LogStream.cc b/ceph/mds/LogStream.cc index a4541287a918f..85b407207c14f 100644 --- a/ceph/mds/LogStream.cc +++ b/ceph/mds/LogStream.cc @@ -13,7 +13,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (mds->get_nodeid() == 0 && (l<=g_conf.debug || l<=g_conf.debug_mds_log)) cout << "mds" << mds->get_nodeid() << ".logstream " diff --git a/ceph/mds/MDBalancer.cc b/ceph/mds/MDBalancer.cc index cf4a5a0bc9512..85836cde06c2a 100644 --- a/ceph/mds/MDBalancer.cc +++ b/ceph/mds/MDBalancer.cc @@ -15,7 +15,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << "mds" << mds->get_nodeid() << ".bal " @@ -85,8 +85,6 @@ void MDBalancer::send_heartbeat() if (mds->get_nodeid() == 0) beat_epoch++; - timepair_t now = g_clock.gettimepair(); - // load mds_load_t load = mds->get_load(); mds_load[ mds->get_nodeid() ] = load; @@ -100,7 +98,7 @@ void MDBalancer::send_heartbeat() CDir *im = *it; if (im->inode->is_root()) continue; int from = im->inode->authority(); - import_map[from] += im->popularity[MDS_POP_CURDOM].get(now); + import_map[from] += im->popularity[MDS_POP_CURDOM].get(); } mds_import_map[ mds->get_nodeid() ] = import_map; @@ -147,7 +145,7 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) //cout << " load is " << load << " have " << mds_load.size() << endl; - int cluster_size = mds->get_cluster()->get_num_mds(); + unsigned cluster_size = mds->get_cluster()->get_num_mds(); if (mds_load.size() == cluster_size) { // let's go! export_empties(); @@ -205,8 +203,6 @@ double MDBalancer::try_match(int ex, double& maxex, void MDBalancer::do_rebalance(int beat) { - timepair_t now = g_clock.gettimepair(); - int cluster_size = mds->get_cluster()->get_num_mds(); int whoami = mds->get_nodeid(); @@ -334,7 +330,7 @@ void MDBalancer::do_rebalance(int beat) for (set::iterator it = mds->mdcache->imports.begin(); it != mds->mdcache->imports.end(); it++) { - double pop = (*it)->popularity[MDS_POP_CURDOM].get(now); + double pop = (*it)->popularity[MDS_POP_CURDOM].get(); if (pop < g_conf.mds_bal_idle_threshold && (*it)->inode != mds->mdcache->get_root()) { dout(5) << " exporting idle import " << **it << endl; @@ -373,7 +369,7 @@ void MDBalancer::do_rebalance(int beat) if (dir->inode->is_root()) continue; if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->popularity[MDS_POP_CURDOM].get(now); + double pop = dir->popularity[MDS_POP_CURDOM].get(); assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy if (pop <= amount) { @@ -422,7 +418,7 @@ void MDBalancer::do_rebalance(int beat) } for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(5) << " exporting fragment " << **it << " pop " << (*it)->popularity[MDS_POP_CURDOM].get(now) << endl; + dout(5) << " exporting fragment " << **it << " pop " << (*it)->popularity[MDS_POP_CURDOM].get() << endl; mds->mdcache->export_dir(*it, target); } } @@ -448,8 +444,6 @@ void MDBalancer::find_exports(CDir *dir, double midchunk = need * .3; double minchunk = need * .001; - timepair_t now = g_clock.gettimepair(); - list bigger; multimap smaller; @@ -468,7 +462,7 @@ void MDBalancer::find_exports(CDir *dir, if (in->dir->is_frozen()) continue; // can't export this right now! if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! - double pop = in->dir->popularity[MDS_POP_CURDOM].get(now); + double pop = in->dir->popularity[MDS_POP_CURDOM].get(); //cout << " in " << in->inode.ino << " " << pop << endl; @@ -538,40 +532,38 @@ void MDBalancer::find_exports(CDir *dir, void MDBalancer::hit_inode(CInode *in) { // hit me - timepair_t now = g_clock.gettimepair(); - in->popularity[MDS_POP_JUSTME].hit(now); - in->popularity[MDS_POP_NESTED].hit(now); + in->popularity[MDS_POP_JUSTME].hit(); + in->popularity[MDS_POP_NESTED].hit(); if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].hit(now); - in->popularity[MDS_POP_ANYDOM].hit(now); + in->popularity[MDS_POP_CURDOM].hit(); + in->popularity[MDS_POP_ANYDOM].hit(); } // hit auth up to import CDir *dir = in->get_parent_dir(); - if (dir) hit_recursive(dir, now); + if (dir) hit_recursive(dir); } void MDBalancer::hit_dir(CDir *dir) { // hit me - timepair_t now = g_clock.gettimepair(); - dir->popularity[MDS_POP_JUSTME].hit(now); + dir->popularity[MDS_POP_JUSTME].hit(); - hit_recursive(dir, now); + hit_recursive(dir); } -void MDBalancer::hit_recursive(CDir *dir, timepair_t& now) +void MDBalancer::hit_recursive(CDir *dir) { bool anydom = dir->is_auth(); bool curdom = dir->is_auth(); // replicate? - float dir_pop = dir->popularity[MDS_POP_CURDOM].get(now); // hmm?? + float dir_pop = dir->popularity[MDS_POP_CURDOM].get(); // hmm?? if (dir->is_auth()) { if (!dir->is_rep() && @@ -597,17 +589,17 @@ void MDBalancer::hit_recursive(CDir *dir, timepair_t& now) while (dir) { CInode *in = dir->inode; - dir->popularity[MDS_POP_NESTED].hit(now); - in->popularity[MDS_POP_NESTED].hit(now); + dir->popularity[MDS_POP_NESTED].hit(); + in->popularity[MDS_POP_NESTED].hit(); if (anydom) { - dir->popularity[MDS_POP_ANYDOM].hit(now); - in->popularity[MDS_POP_ANYDOM].hit(now); + dir->popularity[MDS_POP_ANYDOM].hit(); + in->popularity[MDS_POP_ANYDOM].hit(); } if (curdom) { - dir->popularity[MDS_POP_CURDOM].hit(now); - in->popularity[MDS_POP_CURDOM].hit(now); + dir->popularity[MDS_POP_CURDOM].hit(); + in->popularity[MDS_POP_CURDOM].hit(); } if (dir->is_import()) @@ -622,48 +614,46 @@ void MDBalancer::hit_recursive(CDir *dir, timepair_t& now) */ void MDBalancer::subtract_export(CDir *dir) { - timepair_t now = g_clock.gettimepair(); - double curdom = -dir->popularity[MDS_POP_CURDOM].get(now); + double curdom = -dir->popularity[MDS_POP_CURDOM].get(); bool in_domain = !dir->is_import(); while (true) { CInode *in = dir->inode; - in->popularity[MDS_POP_ANYDOM].adjust(now, curdom); - if (in_domain) in->popularity[MDS_POP_CURDOM].adjust(now, curdom); + in->popularity[MDS_POP_ANYDOM].adjust(curdom); + if (in_domain) in->popularity[MDS_POP_CURDOM].adjust(curdom); dir = in->get_parent_dir(); if (!dir) break; if (dir->is_import()) in_domain = false; - dir->popularity[MDS_POP_ANYDOM].adjust(now, curdom); - if (in_domain) dir->popularity[MDS_POP_CURDOM].adjust(now, curdom); + dir->popularity[MDS_POP_ANYDOM].adjust(curdom); + if (in_domain) dir->popularity[MDS_POP_CURDOM].adjust(curdom); } } void MDBalancer::add_import(CDir *dir) { - timepair_t now = g_clock.gettimepair(); - double curdom = dir->popularity[MDS_POP_CURDOM].get(now); + double curdom = dir->popularity[MDS_POP_CURDOM].get(); bool in_domain = !dir->is_import(); while (true) { CInode *in = dir->inode; - in->popularity[MDS_POP_ANYDOM].adjust(now, curdom); - if (in_domain) in->popularity[MDS_POP_CURDOM].adjust(now, curdom); + in->popularity[MDS_POP_ANYDOM].adjust(curdom); + if (in_domain) in->popularity[MDS_POP_CURDOM].adjust(curdom); dir = in->get_parent_dir(); if (!dir) break; if (dir->is_import()) in_domain = false; - dir->popularity[MDS_POP_ANYDOM].adjust(now, curdom); - if (in_domain) dir->popularity[MDS_POP_CURDOM].adjust(now, curdom); + dir->popularity[MDS_POP_ANYDOM].adjust(curdom); + if (in_domain) dir->popularity[MDS_POP_CURDOM].adjust(curdom); } } @@ -686,13 +676,11 @@ void MDBalancer::show_imports(bool external) set ecopy = mds->mdcache->exports; - timepair_t now = g_clock.gettimepair(); - for (set::iterator it = mds->mdcache->imports.begin(); it != mds->mdcache->imports.end(); it++) { CDir *im = *it; - dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM].get(now) << "/" << im->popularity[MDS_POP_ANYDOM].get(now) << ") " << *im << endl; + dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM].get() << "/" << im->popularity[MDS_POP_ANYDOM].get() << ") " << *im << endl; assert( im->is_import() ); assert( im->is_auth() ); @@ -700,7 +688,7 @@ void MDBalancer::show_imports(bool external) p != mds->mdcache->nested_exports[im].end(); p++) { CDir *exp = *p; - dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED].get(now) << ", " << exp->popularity[MDS_POP_ANYDOM].get(now) << ") " << *exp << " to " << exp->dir_auth << endl; + dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED].get() << ", " << exp->popularity[MDS_POP_ANYDOM].get() << ") " << *exp << " to " << exp->dir_auth << endl; assert( exp->is_export() ); assert( !exp->is_auth() ); diff --git a/ceph/mds/MDBalancer.h b/ceph/mds/MDBalancer.h index 464c2d6d6b138..db15e3a261a3b 100644 --- a/ceph/mds/MDBalancer.h +++ b/ceph/mds/MDBalancer.h @@ -69,7 +69,7 @@ class MDBalancer { void hit_inode(class CInode *in); void hit_dir(class CDir *dir); - void hit_recursive(class CDir *dir, timepair_t& now); + void hit_recursive(class CDir *dir); void show_imports(bool external=false); diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 64ef08bc864d1..23499412a46ef 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -69,7 +69,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << ".cache " @@ -102,6 +102,7 @@ bool MDCache::shutdown() show_imports(); //dump(); } + return true; } @@ -441,7 +442,7 @@ void MDCache::export_empty_import(CDir *dir) } -bool MDCache::trim(__int32_t max) { +bool MDCache::trim(int max) { if (max < 0) { max = lru.lru_get_max(); if (!max) return false; @@ -449,7 +450,7 @@ bool MDCache::trim(__int32_t max) { map expiremap; - while (lru.lru_get_size() > max) { + while (lru.lru_get_size() > (unsigned)max) { CInode *in = (CInode*)lru.lru_expire(); if (!in) break; //return false; @@ -756,6 +757,8 @@ int MDCache::open_root(Context *c) waiting_for_root.push_back(c); } + + return 0; } @@ -973,7 +976,7 @@ int MDCache::path_traverse(filepath& origpath, // make our own copy, since we'll modify when we hit symlinks filepath path = origpath; - int depth = 0; + unsigned depth = 0; while (depth < path.depth()) { dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl; @@ -1302,7 +1305,7 @@ void MDCache::open_remote_ino_2(inodeno_t ino, // construct path filepath path; - for (int i=0; iref_dn); dout(7) << " path is " << path << endl; @@ -1768,7 +1771,7 @@ void MDCache::handle_discover(MDiscover *dis) // add content // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (int i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { + for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { // add dir if (reply->is_empty() && !dis->wants_base_dir()) { dout(7) << "they don't want the base dir" << endl; @@ -2695,7 +2698,7 @@ void MDCache::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) string destname = destdn->name; CInode *in = srcdn->inode; - Message *req = srcdn->xlockedby; + //Message *req = srcdn->xlockedby; // determine the players @@ -3080,11 +3083,10 @@ void MDCache::handle_rename(MRename *m) // decode + import inode (into new location start) int off = 0; - timepair_t now = g_clock.gettimepair(); // HACK bufferlist bufstate; bufstate.append(m->get_inode_state().c_str(), m->get_inode_state().length()); - decode_import_inode(destdn, bufstate, off, m->get_source(), now); + decode_import_inode(destdn, bufstate, off, m->get_source()); CInode *in = destdn->inode; assert(in); @@ -3288,51 +3290,64 @@ __uint64_t MDCache::issue_file_data_version(CInode *in) } -int MDCache::issue_file_caps(CInode *in, - int mode, - MClientRequest *req) +Capability* MDCache::issue_file_caps(CInode *in, + int mode, + MClientRequest *req) { dout(7) << "issue_file_caps for mode " << mode << " on " << *in << endl; // my needs - int my_want = file_mode_want_caps(mode); - int my_conflicts = file_mode_conflict_caps(mode); + int my_client = req->get_client(); + int my_want = 0; + if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; + if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; + + // register a capability + Capability *cap = in->get_cap(my_client); + if (!cap) { + Capability c(my_want); + in->add_cap(my_client, c); + cap = in->get_cap(my_client); + } // look at what caps are already issued int issued = 0; int want = my_want; - int conflicts = my_conflicts; - for (map::iterator it = in->fh_map.begin(); - it != in->fh_map.end(); + int conflicts = 0; + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); it++) { - issued |= it->second->pending_caps | it->second->confirmed_caps; - want |= file_mode_want_caps( it->second->mode ); - conflicts |= file_mode_conflict_caps( it->second->mode ); + issued |= it->second.issued(); + want |= it->second.wanted(); + if (it->first != my_client) + conflicts |= it->second.wanted_conflicts(); } - dout(10) << " issued: " << issued << endl; - dout(10) << " want: " << want << endl; - dout(10) << " conflicts: " << conflicts << endl; + dout(10) << " issued: " << cap_string(issued) << endl; + dout(10) << " want: " << cap_string(want) << endl; + dout(10) << " want conflicts: " << cap_string(conflicts) << endl; // what's allowed, given this mix? int allowed = want - (want & conflicts); - dout(10) << " allowed: " << allowed << endl; + dout(10) << " allowed: " << cap_string(allowed) << endl; // problems? if (issued & conflicts) { - dout(7) << " conflict with existing fh's: " << (issued & conflicts) << endl; + dout(7) << " conflict with existing caps: " << cap_string(issued & conflicts) << endl; // call back caps! - for (map::iterator it = in->fh_map.begin(); - it != in->fh_map.end(); + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); it++) { - CFile *f = it->second; - int extra = f->pending_caps - (f->pending_caps & allowed); - dout(7) << " fh " << f->fh << " on client " << f->client << " has pending " << f->pending_caps << " .. extra is " << extra << endl; + int extra = it->second.pending() - (it->second.pending() & allowed); + dout(7) << " client " << it->first << " has pending " << it->second.pending() << " .. extra is " << extra << endl; if (extra) { - f->pending_caps -= extra; - dout(7) << " sending MClientFileCaps on " << f->fh << " new pending " << f->pending_caps << endl; - mds->messenger->send_message(new MClientFileCaps(in, f, true), - MSG_ADDR_CLIENT(f->client), 0, MDS_PORT_CACHE); + // issue restricted caps + it->second.issue(it->second.pending() - extra); + + // send + dout(7) << " sending MClientFileCaps on " << it->first << " new pending " << it->second.pending() << endl; + mds->messenger->send_message(new MClientFileCaps(in, it->second, true), + MSG_ADDR_CLIENT(it->first), 0, MDS_PORT_CACHE); } } @@ -3342,17 +3357,20 @@ int MDCache::issue_file_caps(CInode *in, // we're okay! int caps = my_want & allowed; - dout(7) << " issuing caps " << caps << " (i want " << my_want << ", allowed " << allowed << ")" << endl; + dout(7) << " issuing caps " << cap_string(caps) << " (i want " << cap_string(my_want) << ", allowed " << cap_string(allowed) << ")" << endl; assert(caps > 0); + + // issue + cap->issue(caps); // issuing new write permissions? - if ((issued & CFILE_CAP_WR) == 0 && - ( caps & CFILE_CAP_WR) != 0) { + if ((issued & CAP_FILE_WR) == 0 && + ( caps & CAP_FILE_WR) != 0) { dout(7) << " incrementing file_data_version for " << *in << endl; in->inode.file_data_version++; } - return caps; + return cap; } void MDCache::handle_client_file_caps(MClientFileCaps *m) @@ -3373,20 +3391,15 @@ void MDCache::handle_client_file_caps(MClientFileCaps *m) return; } - dout(7) << "handle_client_file_caps on " << *in << " fh " << m->get_fh() << " confirmed caps " << m->get_caps() << endl; + dout(7) << "handle_client_file_caps on " << *in << " confirmed caps " << m->get_caps() << endl; - CFile *f = in->get_fh(m->get_fh()); - assert(f); + Capability *cap = in->get_cap(m->get_client()); + assert(cap); - if (m->get_seq() < f->last_recv) { - dout(7) << "older than last_recv, dropping" << endl; - } else { - f->confirmed_caps = m->get_caps(); - f->last_recv = m->get_seq(); - - // reevaluate, waiters - eval_file_caps(in); - } + cap->confirm_receipt(m->get_seq()); + + // reevaluate, waiters + eval_file_caps(in); delete m; } @@ -4143,7 +4156,7 @@ void MDCache::inode_soft_eval(CInode *in) inode_soft_mode(in,LOCK_MODE_ASYNC); } */ - if (!in->is_open_write() && + if (!in->is_write_caps() && in->softlock.get_mode() != LOCK_MODE_SYNC) { inode_soft_mode(in,LOCK_MODE_SYNC); } @@ -5220,18 +5233,11 @@ void MDCache::handle_lock_dn(MLock *m) case LOCK_AC_UNXLOCK: dout(7) << "handle_lock_dn unxlock on " << *dn << endl; { - CDir *dir = dn->dir; string dname = dn->name; - Message *m = dn->xlockedby; // finish request request_finish(m); // this will drop the locks (and unpin paths!) - - // unxlock - //dentry_xlock_finish(dn); - //dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue); - return; } break; @@ -5713,14 +5719,13 @@ void MDCache::encode_export_inode(CInode *in, bufferlist& enc_state, int new_aut { in->version++; // so local log entries are ignored, etc. (FIXME ??) - // tell clients with fh's about new inode auth - for (map::iterator it = in->fh_map.begin(); - it != in->fh_map.end(); + // tell clients with caps about new inode auth + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); it++) { - CFile *f = it->second; - dout(7) << "encode_export_inode " << *in << " telling client " << f->client << " fh " << f->fh << " new auth " << new_auth << endl; - mds->messenger->send_message(new MClientFileCaps(in, f, false, new_auth), - MSG_ADDR_CLIENT(f->client)); + dout(7) << "encode_export_inode " << *in << " telling client " << it->first << " new auth " << new_auth << endl; + mds->messenger->send_message(new MClientFileCaps(in, it->second, false, new_auth), + MSG_ADDR_CLIENT(it->first)); } // relax locks @@ -6024,10 +6029,10 @@ void MDCache::handle_export_dir_discover(MExportDirDiscover *m) // must discover it! C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); filepath fpath(m->get_path()); - int r = path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error + path_traverse(fpath, onfinish->trace, true, + m, new C_MDS_RetryMessage(mds,m), // on delay/retry + MDS_TRAVERSE_DISCOVER, + onfinish); // on completion|error } void MDCache::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r) @@ -6358,7 +6363,6 @@ void MDCache::handle_export_dir(MExportDir *m) dir_state.claim( m->get_state() ); int off = 0; int num_imported_inodes = 0; - timepair_t now = g_clock.gettimepair(); // for popularity adjustments for (int i = 0; i < m->get_ndirs(); i++) { num_imported_inodes += @@ -6366,8 +6370,7 @@ void MDCache::handle_export_dir(MExportDir *m) off, oldauth, dir, // import root - imported_subdirs, - now); + imported_subdirs); } dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; @@ -6463,7 +6466,7 @@ void MDCache::handle_export_dir_finish(MExportDirFinish *m) } -void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth, timepair_t& now) +void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth) { CInodeExport istate; @@ -6486,7 +6489,7 @@ void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int old } // state after link - istate.update_inode(in, now); + istate.update_inode(in); // add inode? @@ -6531,8 +6534,7 @@ int MDCache::import_dir_block(bufferlist& bl, int& off, int oldauth, CDir *import_root, - list& imported_subdirs, - timepair_t& now) + list& imported_subdirs) { // set up dir CDirExport dstate; @@ -6550,7 +6552,7 @@ int MDCache::import_dir_block(bufferlist& bl, imported_subdirs.push_back(dir->ino()); // assimilate state - dstate.update_dir( dir, now ); + dstate.update_dir( dir ); if (diri->is_auth()) dir->dir_auth = CDIR_AUTH_PARENT; // update_dir may hose dir_auth // mark (may already be marked from get_or_open_dir() above) @@ -6616,7 +6618,7 @@ int MDCache::import_dir_block(bufferlist& bl, } else if (icode == 'I') { // inode - decode_import_inode(dn, bl, off, oldauth, now); + decode_import_inode(dn, bl, off, oldauth); } // mark dentry dirty? (only _after_ we link the inode!) @@ -7474,7 +7476,7 @@ vector MDCache::hack_add_file(string& fn, CInode *in) { } CInode* MDCache::hack_get_file(string& fn) { - int off = 1; + unsigned off = 1; CInode *cur = root; // dirs diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index 1fd49159f065b..1315467197e18 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -139,11 +139,11 @@ class MDCache { } // cache - size_t set_cache_size(size_t max) { + void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { lru.lru_get_size(); } - bool trim(__int32_t max = -1); // trim cache + size_t get_cache_size() { return lru.lru_get_size(); } + bool trim(int max = -1); // trim cache // shutdown void shutdown_start(); @@ -274,7 +274,7 @@ class MDCache { // -- file i/o -- __uint64_t issue_file_data_version(CInode *in); - int issue_file_caps(CInode *in, int mode, MClientRequest *req); + Capability* issue_file_caps(CInode *in, int mode, MClientRequest *req); void eval_file_caps(CInode *in); void handle_client_file_caps(class MClientFileCaps *m); @@ -329,13 +329,12 @@ class MDCache { int& off, int oldauth, CDir *import_root, - list& imported_subdirs, - timepair_t& now); + list& imported_subdirs); void got_hashed_replica(CDir *import, inodeno_t dir_ino, inodeno_t replica_ino); - void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth, timepair_t& now); + void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth); // bystander void handle_export_dir_warning(MExportDirWarning *m); diff --git a/ceph/mds/MDCluster.cc b/ceph/mds/MDCluster.cc index a5ed69420e248..5d70d1c38f26e 100644 --- a/ceph/mds/MDCluster.cc +++ b/ceph/mds/MDCluster.cc @@ -9,7 +9,7 @@ using namespace std; #include #include -#include "include/config.h" +#include "config.h" #define HASHDIR_OID_MULT (0x100 * 0x100000000LL) // 40 bits (~1 trillion) diff --git a/ceph/mds/MDLog.cc b/ceph/mds/MDLog.cc index 48ba97f8e8a16..847364c3bff7e 100644 --- a/ceph/mds/MDLog.cc +++ b/ceph/mds/MDLog.cc @@ -12,7 +12,7 @@ LogType mdlog_logtype; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (mds->get_nodeid() == 0 && (l<=g_conf.debug || l<=g_conf.debug_mds_log)) cout << "mds" << mds->get_nodeid() << ".log " @@ -42,7 +42,7 @@ MDLog::~MDLog() -int MDLog::submit_entry( LogEvent *e, +void MDLog::submit_entry( LogEvent *e, Context *c ) { dout(5) << "submit_entry at " << num_events << endl; @@ -64,7 +64,6 @@ int MDLog::submit_entry( LogEvent *e, delete c; } } - } void MDLog::wait_for_sync( Context *c ) @@ -147,7 +146,7 @@ void MDLog::trim(Context *c) delete le; logger->inc("obs"); } else { - if (trimming.size() < g_conf.mds_log_max_trimming) { + if ((int)trimming.size() < g_conf.mds_log_max_trimming) { // trim! dout(7) << " trimming " << le << endl; trimming.insert(le); diff --git a/ceph/mds/MDLog.h b/ceph/mds/MDLog.h index 6277d9264b0e4..ca1c8d810b2d9 100644 --- a/ceph/mds/MDLog.h +++ b/ceph/mds/MDLog.h @@ -64,8 +64,8 @@ class MDLog { return num_events + trimming.size(); } - int submit_entry( LogEvent *e, - Context *c = 0 ); + void submit_entry( LogEvent *e, + Context *c = 0 ); void wait_for_sync( Context *c ); void flush(); diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index be2eb45bb1cd0..5f75c1e0c221f 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -50,18 +50,13 @@ using namespace std; LogType mds_logtype, mds_cache_logtype; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "mds" << whoami << " " #define dout3(l,mds) if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << " " -ostream& operator<<(ostream& out, MDS& mds) -{ - out << "mds" << mds.get_nodeid() << " "; -} - void C_MDS_RetryMessage::redelegate(MDS *mds, int newmds) { // forward message to new mds @@ -113,7 +108,7 @@ MDS::MDS(MDCluster *mdc, int whoami, Messenger *m) { mds_paused = false; stat_ops = 0; - last_balancer_heartbeat = g_clock.gettimepair(); + last_balancer_heartbeat = g_clock.recent_now(); // log string name; @@ -185,6 +180,7 @@ MDS::~MDS() { int MDS::init() { + return 0; } @@ -202,6 +198,7 @@ int MDS::shutdown_start() if (idalloc) idalloc->shutdown(); handle_shutdown_start(NULL); + return 0; } @@ -234,7 +231,7 @@ void MDS::handle_shutdown_finish(Message *m) did_shut_down.insert(mds); dout(1) << " shut down so far: " << did_shut_down << endl; - if (did_shut_down.size() == mdcluster->get_num_mds()) { + if (did_shut_down.size() == (unsigned)mdcluster->get_num_mds()) { // MDS's all shut down! // shut down osd's @@ -271,15 +268,14 @@ int MDS::shutdown_final() mds_load_t MDS::get_load() { - timepair_t now = g_clock.gettimepair(); mds_load_t l; if (mdcache->get_root()) { - l.root_pop = mdcache->get_root()->popularity[MDS_POP_ANYDOM].get(now); + l.root_pop = mdcache->get_root()->popularity[MDS_POP_ANYDOM].get(); } else l.root_pop = 0; - l.req_rate = stat_req.get(now); - l.rd_rate = stat_read.get(now); - l.wr_rate = stat_write.get(now); + l.req_rate = stat_req.get(); + l.rd_rate = stat_read.get(); + l.wr_rate = stat_write.get(); return l; } @@ -369,10 +365,12 @@ void MDS::proc_message(Message *m) void MDS::dispatch(Message *m) { - mds_lock.Lock(); + // make sure we advacne the clock + g_clock.now(); + // process + mds_lock.Lock(); my_dispatch(m); - mds_lock.Unlock(); } @@ -447,24 +445,12 @@ void MDS::my_dispatch(Message *m) finish_contexts(ls); } - // balance? - static int num_bal_times = g_conf.mds_bal_max; - static timepair_t first = g_clock.gettimepair(); - timepair_t now = g_clock.gettimepair(); - timepair_t elapsed = now - first; - if (true && - whoami == 0 && - (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.first > g_conf.mds_bal_max_until)) && - !shutting_down && !shut_down && - now.first - last_balancer_heartbeat.first >= g_conf.mds_bal_interval) { - last_balancer_heartbeat = now; - balancer->send_heartbeat(); - num_bal_times--; - } - - // periodic logging crap - static timepair_t last_log = g_clock.gettimepair(); - if (last_log.first != now.first) { + // periodic crap (second resolution) + static utime_t last_log = g_clock.recent_now(); + utime_t now = g_clock.recent_now(); + if (last_log.sec() != now.sec()) { + + // log last_log = now; mds_load_t load = get_load(); @@ -476,6 +462,21 @@ void MDS::my_dispatch(Message *m) logger->set("cptail", mdcache->lru.lru_get_pintail()); logger->set("buf", buffer_total_alloc); + // balance? + static int num_bal_times = g_conf.mds_bal_max; + static utime_t first = g_clock.recent_now(); + utime_t elapsed = now; + elapsed -= first; + if (true && + whoami == 0 && + (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) && + !shutting_down && !shut_down && + now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) { + last_balancer_heartbeat = now; + balancer->send_heartbeat(); + num_bal_times--; + } + } // hack @@ -654,8 +655,7 @@ void MDS::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei // include trace if (tracei) { - timepair_t now = g_clock.gettimepair(); - reply->set_trace_dist( tracei, whoami, now ); + reply->set_trace_dist( tracei, whoami ); } // send reply @@ -740,7 +740,7 @@ void MDS::handle_client_request(MClientRequest *req) case MDS_OP_TRUNCATE: if (!req->get_ino()) break; // can be called w/ either fh OR path - case MDS_OP_CLOSE: + case MDS_OP_RELEASE: case MDS_OP_FSYNC: ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed? @@ -864,8 +864,8 @@ void MDS::dispatch_request(Message *m, CInode *ref) handle_client_fsync(req, ref); break; */ - case MDS_OP_CLOSE: - handle_client_close(req, ref); + case MDS_OP_RELEASE: + handle_client_release(req, ref); break; // inodes @@ -937,9 +937,8 @@ void MDS::handle_client_stat(MClientRequest *req, mdcache->inode_soft_read_finish(ref); - timepair_t now = g_clock.gettimepair(); - stat_read.hit(now); - stat_req.hit(now); + stat_read.hit(); + stat_req.hit(); balancer->hit_inode(ref); @@ -961,10 +960,8 @@ void MDS::handle_client_utime(MClientRequest *req, return; // fw or (wait for) sync // do update - time_t mtime = req->get_targ(); - time_t atime = req->get_targ2(); - cur->inode.mtime = mtime; - cur->inode.atime = mtime; + cur->inode.mtime = req->get_targ(); + cur->inode.atime = req->get_targ2(); if (cur->is_auth()) cur->mark_dirty(); @@ -1107,7 +1104,6 @@ void MDS::handle_client_readdir(MClientRequest *req, MClientReply *reply = new MClientReply(req); // build dir contents - timepair_t now = g_clock.gettimepair(); CDir_map_t::iterator it; int numfiles = 0; for (it = cur->dir->begin(); it != cur->dir->end(); it++) { @@ -1126,15 +1122,15 @@ void MDS::handle_client_readdir(MClientRequest *req, // add this item // note: c_inode_info makes note of whether inode data is readable. - reply->add_dir_item(new c_inode_info(in, whoami, it->first, now)); + reply->add_dir_item(new c_inode_info(in, whoami, it->first)); numfiles++; } dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; reply->set_result(0); - stat_read.hit(now); - stat_req.hit(now); + stat_read.hit(); + stat_req.hit(); balancer->hit_dir(cur->dir); @@ -1866,7 +1862,6 @@ void MDS::handle_client_rename_2(MClientRequest *req, assert(srci); CDir* destdir = 0; string destname; - bool result; // what is the dest? (dir or file or complete filename) // note: trace includes root, destpath doesn't (include leading /) @@ -2017,7 +2012,7 @@ void MDS::handle_client_rename_local(MClientRequest *req, CDentry *destdn, string& destname) { - bool everybody = false; + //bool everybody = false; //if (true || srcdn->inode->is_dir()) { /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! i could limit this to cases where something beneath me is exported. @@ -2280,13 +2275,13 @@ void MDS::handle_client_open(MClientRequest *req, // mode! int mode = 0; if (flags & O_WRONLY) - mode = CFILE_MODE_W; + mode = FILE_MODE_W; else if (flags & O_RDWR) - mode = CFILE_MODE_RW; + mode = FILE_MODE_RW; else if (flags & O_APPEND) - mode = CFILE_MODE_W; + mode = FILE_MODE_W; else - mode = CFILE_MODE_R; + mode = FILE_MODE_R; dout(10) << " flags = " << flags << " mode = " << mode << endl; @@ -2302,8 +2297,8 @@ void MDS::handle_client_open(MClientRequest *req, assert(cur->is_auth()); // writer? - if (mode == CFILE_MODE_W || - mode == CFILE_MODE_RW) { + if (mode == FILE_MODE_W || + mode == FILE_MODE_RW) { if (!mdcache->inode_soft_write_start(cur, req)) return; } @@ -2313,30 +2308,22 @@ void MDS::handle_client_open(MClientRequest *req, // can we issue the caps they want? __uint64_t fdv = mdcache->issue_file_data_version(cur); - int caps = mdcache->issue_file_caps(cur, mode, req); - if (!caps) return; // can't issue (yet), so wait! - - // create fh - CFile *f = new CFile(); - f->mode = mode; - f->client = req->get_client(); - f->fh = idalloc->get_id(ID_FH); - f->pending_caps = f->confirmed_caps = caps; - f->last_sent = f->last_recv = 0; // none yet! - cur->add_fh(f); - - dout(12) << "new fh " << f->fh << " gets caps " << caps << endl; - - if (mode == CFILE_MODE_W || - mode == CFILE_MODE_RW) { + Capability *cap = mdcache->issue_file_caps(cur, mode, req); + if (!cap) return; // can't issue (yet), so wait! + + dout(12) << "open gets caps " << cap->pending() << endl; + + if (mode == FILE_MODE_W || + mode == FILE_MODE_RW) { mdcache->inode_soft_write_finish(cur); } balancer->hit_inode(cur); // reply - MClientReply *reply = new MClientReply(req, f->fh); // fh # is return code - reply->set_file_caps(caps); + MClientReply *reply = new MClientReply(req, 0); // fh # is return code + reply->set_file_caps(cap->pending()); + reply->set_file_caps_seq(cap->get_last_seq()); reply->set_file_data_version(fdv); reply_request(req, reply, cur); } @@ -2359,13 +2346,13 @@ void MDS::handle_client_openc(MClientRequest *req, CInode *ref) -void MDS::handle_client_close(MClientRequest *req, CInode *cur) +void MDS::handle_client_release(MClientRequest *req, CInode *cur) { // auth only if (!cur->is_auth()) { int auth = cur->authority(); assert(auth != whoami); - dout(9) << "close " << *cur << " on replica, fw to auth " << auth << endl; + dout(9) << "release " << *cur << " on replica, fw to auth " << auth << endl; mdcache->request_forward(req, auth); return; @@ -2374,17 +2361,16 @@ void MDS::handle_client_close(MClientRequest *req, CInode *cur) // verify on read or write list int client = req->get_client(); - int fh = req->get_iarg(); - CFile *f = cur->get_fh(fh); - if (!f) { - dout(1) << "close on unopen fh " << fh << " inode " << *cur << endl; + Capability *cap = cur->get_cap(client); + if (!cap) { + dout(1) << "release on non-existant capability client " << client << " inode " << *cur << endl; assert(0); } - - dout(10) << "close on " << *cur << ", fh=" << f->fh << " mode=" << f->mode << endl; + + dout(10) << "release on " << *cur << " client " << client << endl; // update soft metadata - if (f->confirmed_caps & (CFILE_CAP_WR|CFILE_CAP_WRBUFFER)) { + if (cap->issued() & CAP_FILE_WR) { assert(cur->softlock.can_write(true)); // otherwise we're toast??? if (!mdcache->inode_soft_write_start(cur, req)) return; // wait @@ -2403,47 +2389,38 @@ void MDS::handle_client_close(MClientRequest *req, CInode *cur) dout(10) << " extended size to " << size << endl; cur->mark_dirty(); } - } - - // atime? **** - // were we writing? - int had_caps = f->pending_caps & f->confirmed_caps; // safe set of caps the client can assume it had - - if ((f->confirmed_caps | f->pending_caps) & CFILE_CAP_WR != 0) { // inc file_data_version dout(7) << " incrementing file_data_version for " << *cur << endl; cur->inode.file_data_version++; - } - - // close it. - cur->remove_fh(f); - - // reclaim fh - //idalloc->reclaim_id(ID_FH, f->fh); // don't do this for now.... we'll rewrite this anyway! FIXME sage - - // ok we're done - if (f->confirmed_caps & (CFILE_CAP_WR|CFILE_CAP_WRBUFFER)) { - dout(7) << "soft write fnish" << endl; + // release write mdcache->inode_soft_write_finish(cur); mdcache->inode_soft_eval(cur); + } else { + dout(10) << "no WR caps issued, not updating mtime/size" << endl; } - mdcache->eval_file_caps(cur); + // XXX what about atime? - // hose CFile - delete f; - // XXX what about atime? + // release it. + int had_caps = cap->issued(); + long had_caps_seq = cap->get_last_seq(); + cur->remove_cap(client); + + // reeval caps + mdcache->eval_file_caps(cur); + // give back a file_data_version to client MClientReply *reply = new MClientReply(req, 0); __uint64_t fdv = mdcache->issue_file_data_version(cur); reply->set_file_caps(had_caps); + reply->set_file_caps_seq(had_caps_seq); reply->set_file_data_version(fdv); - + // commit commit_request(req, reply, cur, new EInodeUpdate(cur)); // FIXME wrong message? diff --git a/ceph/mds/MDS.h b/ceph/mds/MDS.h index a927624691be6..08be9246ed2b9 100644 --- a/ceph/mds/MDS.h +++ b/ceph/mds/MDS.h @@ -136,7 +136,7 @@ class MDS : public Dispatcher { protected: __uint64_t stat_ops; - timepair_t last_balancer_heartbeat; + utime_t last_balancer_heartbeat; public: MDS(MDCluster *mdc, int whoami, Messenger *m); @@ -241,7 +241,7 @@ class MDS : public Dispatcher { // file void handle_client_open(MClientRequest *req, CInode *ref); void handle_client_openc(MClientRequest *req, CInode *ref); - void handle_client_close(MClientRequest *req, CInode *in); + void handle_client_release(MClientRequest *req, CInode *in); void handle_client_truncate(MClientRequest *req, CInode *in); void handle_client_fsync(MClientRequest *req, CInode *in); diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index 0415cefe92655..8027d379b2753 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -17,7 +17,7 @@ using namespace std; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << ".store " @@ -57,7 +57,7 @@ class MDFetchDirContext : public Context { }; -bool MDStore::fetch_dir( CDir *dir, +void MDStore::fetch_dir( CDir *dir, Context *c ) { dout(7) << "fetch_dir " << *dir << " context is " << c << endl; @@ -69,7 +69,7 @@ bool MDStore::fetch_dir( CDir *dir, // already fetching? if (dir->state_test(CDIR_STATE_FETCHING)) { dout(7) << "already fetching " << *dir << "; waiting" << endl; - return true; + return; } dir->state_set(CDIR_STATE_FETCHING); @@ -86,14 +86,14 @@ bool MDStore::fetch_dir( CDir *dir, do_fetch_dir( dir, fin ); // normal } -bool MDStore::fetch_dir_2( int result, +void MDStore::fetch_dir_2( int result, inodeno_t ino) { CInode *idir = mds->mdcache->get_inode(ino); if (result < 0) - dout(7) << *mds << "fetch_dir_2 failed on " << ino << endl; + dout(7) << "fetch_dir_2 failed on " << ino << endl; - if (!idir) return false; + if (!idir) return; assert(idir); assert(idir->dir); @@ -193,7 +193,7 @@ class C_MDS_CommitDirFinish : public Context { -bool MDStore::commit_dir( CDir *dir, +void MDStore::commit_dir( CDir *dir, Context *c ) { assert(dir->is_dirty()); @@ -210,7 +210,7 @@ bool MDStore::commit_dir( CDir *dir, commit_dir(dir, dir->get_version(), c); } -bool MDStore::commit_dir( CDir *dir, +void MDStore::commit_dir( CDir *dir, __uint64_t version, Context *c ) { @@ -228,7 +228,7 @@ bool MDStore::commit_dir( CDir *dir, dir->add_waiter(CDIR_WAIT_COMMITTED, new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return false; + return; } if (!dir->can_auth_pin()) { @@ -236,7 +236,7 @@ bool MDStore::commit_dir( CDir *dir, dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return false; + return; } @@ -246,7 +246,7 @@ bool MDStore::commit_dir( CDir *dir, // fetch dir first fetch_dir(dir, new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return false; + return; } @@ -275,7 +275,7 @@ bool MDStore::commit_dir( CDir *dir, } } -bool MDStore::commit_dir_2( int result, +void MDStore::commit_dir_2( int result, CDir *dir, __uint64_t committed_version) { @@ -587,14 +587,14 @@ void MDStore::do_fetch_dir_2( bufferlist& bl, { CInode *idir = mds->mdcache->get_inode(ino); if (!idir) { - dout(7) << *mds << "do_fetch_dir_2 on ino " << ino << " but no longer in our cache!" << endl; + dout(7) << "do_fetch_dir_2 on ino " << ino << " but no longer in our cache!" << endl; c->finish(-1); delete c; return; } if (!idir->dir_is_auth()) { - dout(7) << *mds << "do_fetch_dir_2 on " << *idir << ", but i'm not auth" << endl; + dout(7) << "do_fetch_dir_2 on " << *idir << ", but i'm not auth" << endl; c->finish(-1); delete c; return; @@ -604,7 +604,7 @@ void MDStore::do_fetch_dir_2( bufferlist& bl, CDir *dir = idir->get_or_open_dir(mds); // do it - dout(7) << *mds << "do_fetch_dir_2 hashcode " << hashcode << " dir " << *dir << endl; + dout(7) << "do_fetch_dir_2 hashcode " << hashcode << " dir " << *dir << endl; // parse buffer contents into cache dout(15) << "bl is " << bl << endl; @@ -624,7 +624,7 @@ void MDStore::do_fetch_dir_2( bufferlist& bl, dout(10) << " " << num << " items in " << size << " bytes" << endl; - int parsed = 0; + unsigned parsed = 0; while (parsed < num) { assert(p < buflen && num > 0); parsed++; diff --git a/ceph/mds/MDStore.h b/ceph/mds/MDStore.h index 7c8b91adb1837..b931b29a61a6b 100644 --- a/ceph/mds/MDStore.h +++ b/ceph/mds/MDStore.h @@ -26,14 +26,14 @@ class MDStore { } // basic interface (normal or unhashed) - bool fetch_dir( CDir *dir, + void fetch_dir( CDir *dir, Context *c ); - bool fetch_dir_2( int result, + void fetch_dir_2( int result, inodeno_t ino ); - bool commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. - bool commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk - bool commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); + void commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. + void commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk + void commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); // low level committer void do_commit_dir( CDir *dir, diff --git a/ceph/mds/OSDMonitor.cc b/ceph/mds/OSDMonitor.cc index 75e846cdd2e6b..023a87a176fed 100644 --- a/ceph/mds/OSDMonitor.cc +++ b/ceph/mds/OSDMonitor.cc @@ -14,7 +14,7 @@ #include "common/Timer.h" #include "common/Clock.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << ".osdmon: " diff --git a/ceph/mds/events/EAlloc.h b/ceph/mds/events/EAlloc.h index 2215f8e6c231f..7e174c4c619c4 100644 --- a/ceph/mds/events/EAlloc.h +++ b/ceph/mds/events/EAlloc.h @@ -2,7 +2,7 @@ #define __EALLOC_H #include -#include "include/config.h" +#include "config.h" #include "include/types.h" #include "../LogEvent.h" diff --git a/ceph/mds/events/EInodeUpdate.h b/ceph/mds/events/EInodeUpdate.h index a09f6e5cdbb75..535c21f072b03 100644 --- a/ceph/mds/events/EInodeUpdate.h +++ b/ceph/mds/events/EInodeUpdate.h @@ -2,7 +2,7 @@ #define __EINODEUPDATE_H #include -#include "include/config.h" +#include "config.h" #include "include/types.h" #include "../LogEvent.h" #include "../CInode.h" diff --git a/ceph/mds/events/EUnlink.h b/ceph/mds/events/EUnlink.h index 051b92cd95d0a..115b26300283a 100644 --- a/ceph/mds/events/EUnlink.h +++ b/ceph/mds/events/EUnlink.h @@ -2,7 +2,7 @@ #define __EUNLINK_H #include -#include "include/config.h" +#include "config.h" #include "include/types.h" #include "../LogEvent.h" #include "../CInode.h" diff --git a/ceph/messages/MAnchorReply.h b/ceph/messages/MAnchorReply.h index 96ad136f8f46d..00e040cb2919d 100644 --- a/ceph/messages/MAnchorReply.h +++ b/ceph/messages/MAnchorReply.h @@ -21,7 +21,7 @@ class MAnchorReply : public Message { this->ino = req->get_ino(); } ~MAnchorReply() { - for (int i=0; iino = ino; } ~MAnchorRequest() { - for (int i=0; ilast_sent++; - this->seq = f->last_sent; + + this->seq = cap.get_last_seq(); + this->caps = cap.pending(); + this->inode = in->inode; - this->fh = f->fh; - this->caps = f->pending_caps; this->need_ack = need_ack; this->mds = new_mds; } @@ -39,22 +41,22 @@ class MClientFileCaps : public Message { off += sizeof(seq); s.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - s.copy(off, sizeof(fh), (char*)&fh); - off += sizeof(fh); s.copy(off, sizeof(mds), (char*)&mds); off += sizeof(mds); s.copy(off, sizeof(caps), (char*)&caps); off += sizeof(caps); s.copy(off, sizeof(need_ack), (char*)&need_ack); off += sizeof(need_ack); + s.copy(off, sizeof(client), (char*)&client); + off += sizeof(client); } virtual void encode_payload(crope& s) { s.append((char*)&seq, sizeof(seq)); s.append((char*)&inode, sizeof(inode)); - s.append((char*)&fh,sizeof(fh)); s.append((char*)&mds,sizeof(mds)); s.append((char*)&caps, sizeof(caps)); s.append((char*)&need_ack, sizeof(need_ack)); + s.append((char*)&client, sizeof(client)); } }; diff --git a/ceph/messages/MClientInodeAuthUpdate.h b/ceph/messages/MClientInodeAuthUpdate.h index 3272e0fc2b1ea..17aa9c3e7f15c 100644 --- a/ceph/messages/MClientInodeAuthUpdate.h +++ b/ceph/messages/MClientInodeAuthUpdate.h @@ -2,31 +2,29 @@ #define __MCLIENTINODEAUTHUPDATE_H class MClientInodeAuthUpdate : public Message { - //inodeno_t ino; - fileh_t fh; + inodeno_t ino; int newauth; public: - //inodeno_t get_ino() { return ino; } - fileh_t get_fh() { return fh; } + inodeno_t get_ino() { return ino; } int get_auth() { return newauth; } MClientInodeAuthUpdate() {} - MClientInodeAuthUpdate(fileh_t fh, int newauth) : + MClientInodeAuthUpdate(inodeno_t ino, int newauth) : Message(MSG_CLIENT_INODEAUTHUPDATE) { - this->fh = fh; + this->ino = ino; this->newauth = newauth; } virtual char *get_type_name() { return "Ciau";} virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(fh), (char*)&fh); - off += sizeof(fh); + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); s.copy(off, sizeof(newauth), (char*)&newauth); off += sizeof(newauth); } virtual void encode_payload(crope& s) { - s.append((char*)&fh,sizeof(fh)); + s.append((char*)&ino,sizeof(ino)); s.append((char*)&newauth,sizeof(newauth)); } }; diff --git a/ceph/messages/MClientMountAck.h b/ceph/messages/MClientMountAck.h index ad296ecc771a2..0facbf8c97e27 100644 --- a/ceph/messages/MClientMountAck.h +++ b/ceph/messages/MClientMountAck.h @@ -28,7 +28,7 @@ class MClientMountAck : public Message { int off = 0; payload.copy(off, sizeof(pcid), (char*)&pcid); off += sizeof(pcid); - if (off < payload.length()) + if ((unsigned)off < payload.length()) payload.splice( off, payload.length()-off, &osd_cluster_state); } virtual void encode_payload() { diff --git a/ceph/messages/MClientReply.h b/ceph/messages/MClientReply.h index 79b348254fade..a501772b2496d 100644 --- a/ceph/messages/MClientReply.h +++ b/ceph/messages/MClientReply.h @@ -49,7 +49,7 @@ class c_inode_info { public: c_inode_info() {} - c_inode_info(CInode *in, int whoami, string ref_dn, timepair_t& now) { + c_inode_info(CInode *in, int whoami, string ref_dn) { // inode this->inode = in->inode; this->inode_soft_valid = in->softlock.can_read(in->is_auth()); @@ -65,7 +65,7 @@ class c_inode_info { spec_defined = in->dir && in->dir->is_auth(); if (spec_defined) { dir_auth = in->dir->get_dir_auth(); - in->dir->get_dist_spec(this->dist, whoami, now); + in->dir->get_dist_spec(this->dist, whoami); } } @@ -108,6 +108,7 @@ typedef struct { int trace_depth; int dir_size; unsigned char file_caps; // for open + long file_caps_seq; __uint64_t file_data_version; // for client buffercache consistency } MClientReply_st; @@ -131,10 +132,12 @@ class MClientReply : public Message { const vector& get_trace() { return trace; } vector& get_dir_contents() { return dir_contents; } unsigned char get_file_caps() { return st.file_caps; } + long get_file_caps_seq() { return st.file_caps_seq; } __uint64_t get_file_data_version() { return st.file_data_version; } void set_result(int r) { st.result = r; } void set_file_caps(unsigned char c) { st.file_caps = c; } + void set_file_caps_seq(long s) { st.file_caps_seq = s; } void set_file_data_version(__uint64_t v) { st.file_data_version = v; } MClientReply() {}; @@ -204,14 +207,14 @@ class MClientReply : public Message { dir_contents.push_back(c); } - void set_trace_dist(CInode *in, int whoami, timepair_t& now) { + void set_trace_dist(CInode *in, int whoami) { while (in) { // add this inode to trace, along with referring dentry name string ref_dn; CDentry *dn = in->get_parent_dn(); if (dn) ref_dn = dn->get_name(); - trace.insert(trace.begin(), new c_inode_info(in, whoami, ref_dn, now)); + trace.insert(trace.begin(), new c_inode_info(in, whoami, ref_dn)); in = in->get_parent_inode(); } diff --git a/ceph/messages/MClientRequest.h b/ceph/messages/MClientRequest.h index 02b372e77d949..41283364ef3f8 100644 --- a/ceph/messages/MClientRequest.h +++ b/ceph/messages/MClientRequest.h @@ -161,8 +161,8 @@ inline ostream& operator<<(ostream& out, MClientRequest& req) { out << "truncate"; break; case MDS_OP_FSYNC: out << "fsync"; break; - case MDS_OP_CLOSE: - out << "close"; break; + case MDS_OP_RELEASE: + out << "release"; break; default: out << "unknown=" << req.get_op(); } diff --git a/ceph/msg/CheesySerializer.cc b/ceph/msg/CheesySerializer.cc index a73db49fdcadd..cb3c179c1c1fc 100644 --- a/ceph/msg/CheesySerializer.cc +++ b/ceph/msg/CheesySerializer.cc @@ -6,7 +6,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "serializer: " diff --git a/ceph/msg/FakeMessenger.cc b/ceph/msg/FakeMessenger.cc index 921636a95879b..20c954e95a3ae 100644 --- a/ceph/msg/FakeMessenger.cc +++ b/ceph/msg/FakeMessenger.cc @@ -1,6 +1,4 @@ - - #include "Message.h" #include "FakeMessenger.h" #include "mds/MDS.h" @@ -10,7 +8,7 @@ #include "common/LogType.h" #include "common/Logger.h" -#include "include/config.h" +#include "config.h" #include #include @@ -28,12 +26,10 @@ using namespace __gnu_cxx; #include "common/Mutex.h" #include -#include "include/config.h" - // global queue. -map directory; +map directory; hash_map loggers; LogType fakemsg_logtype; @@ -72,6 +68,8 @@ void *fakemessenger_thread(void *ptr) if (shutdown) break; fakemessenger_do_loop_2(); + + if (directory.empty()) break; } lock.Unlock(); @@ -79,6 +77,7 @@ void *fakemessenger_thread(void *ptr) g_timer.unset_messenger_kicker(); dout(1) << "thread finish (i woke up but no messages, bye)" << endl; + return 0; } @@ -101,7 +100,6 @@ void fakemessenger_wait() cout << "fakemessenger_wait waiting" << endl; void *ptr; pthread_join(thread_id, &ptr); - } @@ -116,6 +114,7 @@ int fakemessenger_do_loop() lock.Unlock(); g_timer.shutdown(); + return 0; } @@ -213,12 +212,7 @@ FakeMessenger::FakeMessenger(long me) : Messenger(me) FakeMessenger::~FakeMessenger() { - shutdown(); - if (loggers[whoami]) { - delete loggers[whoami]; - loggers.erase(whoami); - } } @@ -226,13 +220,24 @@ int FakeMessenger::shutdown() { //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; lock.Lock(); + + assert(directory.count(whoami) == 1); directory.erase(whoami); if (directory.empty()) { dout(1) << "fakemessenger: last shutdown" << endl; ::shutdown = true; cond.Signal(); // why not } + + /* + if (loggers[whoami]) { + delete loggers[whoami]; + loggers.erase(whoami); + } + */ + lock.Unlock(); + return 0; } /* @@ -284,13 +289,12 @@ int FakeMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromp // wake up loop? if (!awake) { dout(10) << "waking up fakemessenger thread" << endl; - awake = true; - lock.Unlock(); cond.Signal(); + lock.Unlock(); } else lock.Unlock(); - - + + return 0; } diff --git a/ceph/msg/HostMonitor.cc b/ceph/msg/HostMonitor.cc index 7d35712a9717a..f7000049dcb14 100644 --- a/ceph/msg/HostMonitor.cc +++ b/ceph/msg/HostMonitor.cc @@ -15,7 +15,7 @@ #define DBL 10 -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: " @@ -63,7 +63,7 @@ void HostMonitor::init() notify_retry_interval = 10; // schedule first hb - schedule_heartbeat(); + //schedule_heartbeat(); } diff --git a/ceph/msg/MPIMessenger.cc b/ceph/msg/MPIMessenger.cc index 4754b966eb34e..379ce1869c5c1 100644 --- a/ceph/msg/MPIMessenger.cc +++ b/ceph/msg/MPIMessenger.cc @@ -1,5 +1,5 @@ -#include "include/config.h" +#include "config.h" #include "include/error.h" #include "common/Timer.h" @@ -51,11 +51,11 @@ bool pending_timer; // our lock for any common data; it's okay to have only the one global mutex // because our common data isn't a whole lot. -static pthread_mutex_t mutex; +//static pthread_mutex_t mutex; // the number of distinct threads we've seen so far; used to generate // a unique tag for each thread. -static int nthreads = 10; +//static int nthreads = 10; //#define TAG_UNSOLICITED 0 @@ -93,6 +93,7 @@ int mpimessenger_shutdown() MPI_Barrier (MPI_COMM_WORLD); dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl; MPI_Finalize(); + return 0; } int mpimessenger_world() @@ -266,7 +267,7 @@ int mpi_send(Message *m, int tag) it != m->get_payload().buffers().end(); it++) { dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl; - MPI_Request *req = new MPI_Request; + //MPI_Request *req = new MPI_Request; ASSERT(MPI_Isend((void*)(*it).c_str(), (*it).length(), MPI_CHAR, @@ -286,6 +287,7 @@ int mpi_send(Message *m, int tag) #ifndef FUNNEL_MPI sender_lock.Unlock(); #endif + return 0; } @@ -402,6 +404,7 @@ void* mpimessenger_loop(void*) g_timer.shutdown(); dout(5) << "mpimessenger_loop exiting loop" << endl; + return 0; } @@ -415,6 +418,7 @@ int mpimessenger_start() NULL, mpimessenger_loop, 0); + return 0; } @@ -547,6 +551,7 @@ int MPIMessenger::shutdown() } else { dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl; } + return 0; } @@ -580,6 +585,7 @@ int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int frompo mpi_send(m, m->get_pcid()); #endif + return 0; } diff --git a/ceph/msg/Message.h b/ceph/msg/Message.h index 6223778ec2f05..8292837e55a62 100644 --- a/ceph/msg/Message.h +++ b/ceph/msg/Message.h @@ -209,7 +209,7 @@ class Message { int off = 0; decode_payload(ser, off); - assert(off == payload.length()); + assert((unsigned)off == payload.length()); } virtual void encode_payload() { assert(payload.length() == 0); // caller should reset payload diff --git a/ceph/msg/Messenger.cc b/ceph/msg/Messenger.cc index 545cc76d0affc..546f7e3b533b0 100644 --- a/ceph/msg/Messenger.cc +++ b/ceph/msg/Messenger.cc @@ -1,7 +1,6 @@ #include #include "include/types.h" -#include "include/config.h" #include "Message.h" #include "Messenger.h" @@ -28,7 +27,7 @@ using namespace std; #include "messages/MClientRequest.h" #include "messages/MClientReply.h" #include "messages/MClientFileCaps.h" -#include "messages/MClientInodeAuthUpdate.h" +//#include "messages/MClientInodeAuthUpdate.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -67,7 +66,7 @@ using namespace std; #include "messages/MLock.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "messenger: " #define DEBUGLVL 10 // debug level of output @@ -100,8 +99,7 @@ void Messenger::dispatch(Message *m) // yes, this is a reply to a pending call. dout(DEBUGLVL) << "dispatch got reply for " << pcid << " " << m << endl; call_reply[pcid] = m; // set reply - int r = call_cond[pcid]->Signal(); - //cout << "post = " << r << endl; + call_cond[pcid]->Signal(); _lock.Unlock(); } else { // no, this is an unsolicited message. @@ -178,8 +176,6 @@ Message *Messenger::sendrecv(Message *m, msg_addr_t dest, int port) Message * decode_message(msg_envelope_t& env, bufferlist& payload) { - int type; - // make message Message *m = 0; switch(env.type) { @@ -228,9 +224,9 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_CLIENT_FILECAPS: m = new MClientFileCaps(); break; - case MSG_CLIENT_INODEAUTHUPDATE: - m = new MClientInodeAuthUpdate(); - break; + // case MSG_CLIENT_INODEAUTHUPDATE: + //m = new MClientInodeAuthUpdate(); + //break; // mds case MSG_MDS_DIRUPDATE: diff --git a/ceph/msg/TCPMessenger.cc b/ceph/msg/TCPMessenger.cc index 51f1ec3730565..400176e9ad036 100644 --- a/ceph/msg/TCPMessenger.cc +++ b/ceph/msg/TCPMessenger.cc @@ -1,5 +1,5 @@ -#include "include/config.h" +#include "config.h" #include "include/error.h" #include "common/Timer.h" @@ -191,6 +191,8 @@ int tcpmessenger_shutdown() delete[] remote_addr; delete[] in_sd; delete[] out_sd; + + return 0; } int tcpmessenger_world() @@ -397,6 +399,7 @@ int tcp_send(Message *m) // hose message delete m; + return 0; } @@ -434,6 +437,7 @@ void* tcp_outthread(void*) } outgoing_lock.Unlock(); + return 0; } /** tcp_inthread @@ -510,6 +514,7 @@ void *tcp_acceptthread(void *) } } dout(DBL) << "got incoming from everyone!" << endl; + return 0; } @@ -586,8 +591,8 @@ void* tcp_dispatchthread(void*) g_timer.shutdown(); - dout(5) << "tcp_dispatchthread exiting loop" << endl; + return 0; } @@ -614,7 +619,7 @@ int tcpmessenger_start() NULL, tcp_outthread, 0); - + return 0; } @@ -710,14 +715,14 @@ int TCPMessenger::shutdown() // last one? if (lastone) { dout(2) << "shutdown last tcpmessenger on rank " << mpi_rank << " shut down" << endl; - pthread_t whoami = pthread_self(); + //pthread_t whoami = pthread_self(); // no more timer events g_timer.unset_messenger_kicker(); // close incoming sockets - void *r; + //void *r; for (int i=0; i +#include +using namespace std; + +template +class BDBMap { + private: + DB *dbp; + + public: + BDBMap() { + int r; + if ((r = db_create(&dbp, NULL, 0)) != 0) { + cerr << "db_create: " << db_strerror(r) << endl; + assert(0); + } + } + ~BDBMap() { + close(); + } + + // open/close + int open(const char *fn) { + int r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); + assert(r == 0); + return 0; + } + void close() { + dbp->close(dbp,0); + dbp = 0; + } + void remove(const char *fn) { + dbp->remove(dbp, fn, 0, 0); + dbp = 0; + } + + // accessors + int put(K key, + D data) { + DBT k; + k.data = &key; + k.size = sizeof(K); + DBT d; + d.data = &data; + d.size = sizeof(data); + return dbp->put(dbp, NULL, &k, &d, 0); + } + + int get(K key, + D& data) { + DBT k; + k.data = &key; + k.size = sizeof(key); + DBT d; + d.data = &data; + d.size = sizeof(data); + int r = dbp->get(dbp, NULL, &k, &d, 0); + return r; + } + + int del(K key) { + DBT k; + k.data = &key; + k.size = sizeof(key); + return dbp->del(dbp, NULL, &k, 0); + } + + int list_keys(list& ls) { + DBC *cursor = 0; + int r = dbp->cursor(dbp, NULL, &cursor, 0); + assert(r == 0); + + K key; + D data; + + DBT k,d; + k.data = &key; + k.size = sizeof(key); + d.data = &data; + d.size = sizeof(data); + + while (1) { + int r = cursor->c_get(cursor, &k, &d, DB_NEXT); + if (r == DB_NOTFOUND) break; + assert(r == 0); + ls.push_back(key); + } + cursor->c_close(cursor); + return 0; + } + +}; + +#endif diff --git a/ceph/osd/FakeStore.cc b/ceph/osd/FakeStore.cc index 27fd2b24c4dbe..2459b77c8bcf7 100644 --- a/ceph/osd/FakeStore.cc +++ b/ceph/osd/FakeStore.cc @@ -13,9 +13,9 @@ #include #include #include +#include - -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".fakestore " @@ -27,16 +27,10 @@ -FakeStore::FakeStore(char *base, int whoami, char *shadow) +FakeStore::FakeStore(char *base, int whoami) { this->basedir = base; this->whoami = whoami; - - if (shadow) { - is_shadow = true; - shadowdir = shadow; - } else - is_shadow = false; } @@ -46,9 +40,6 @@ int FakeStore::init() get_dir(mydir); dout(5) << "init with basedir " << mydir << endl; - if (is_shadow) { - dout(5) << " SHADOW dir is " << shadowdir << endl; - } // make sure global base dir exists struct stat st; @@ -66,6 +57,7 @@ int FakeStore::finalize() { dout(5) << "finalize" << endl; // nothing + return 0; } @@ -78,13 +70,10 @@ void FakeStore::get_dir(string& dir) { sprintf(s, "%d", whoami); dir = basedir + "/" + s; } -void FakeStore::get_oname(object_t oid, string& fn, bool shadow) { +void FakeStore::get_oname(object_t oid, string& fn) { char s[100]; sprintf(s, "%d/%02llx/%016llx", whoami, HASH_FUNC(oid), oid); - if (shadow) - fn = shadowdir + "/" + s; - else - fn = basedir + "/" + s; + fn = basedir + "/" + s; // dout(1) << "oname is " << fn << endl; } @@ -96,7 +85,7 @@ void FakeStore::wipe_dir(string mydir) dout(10) << "wiping " << mydir << endl; struct dirent *ent = 0; - while (ent = ::readdir(dir)) { + while ((ent = ::readdir(dir)) != 0) { if (ent->d_name[0] == '.') continue; dout(25) << "mkfs unlinking " << ent->d_name << endl; string fn = mydir + "/" + ent->d_name; @@ -118,10 +107,6 @@ int FakeStore::mkfs() dout(1) << "mkfs in " << mydir << endl; - if (is_shadow) { - dout(1) << "WARNING mkfs reverting to shadow fs, which pbly isn't what MDS expects!" << endl; - } - // make sure my dir exists r = ::stat(mydir.c_str(), &st); if (r != 0) { @@ -176,49 +161,10 @@ int FakeStore::stat(object_t oid, string fn; get_oname(oid,fn); int r = ::stat(fn.c_str(), st); - - if (is_shadow && - r != 0 && // primary didn't exist - ::lstat(fn.c_str(), st) != 0) { // and wasn't an intentionally bad symlink - get_oname(oid,fn,true); - return ::stat(fn.c_str(), st); - } else - return r; + return r; } -void FakeStore::shadow_copy_maybe(object_t oid) { - struct stat st; - string fn; - get_oname(oid, fn); - if (::lstat(fn.c_str(), &st) == 0) - return; // live copy exists, we're fine, do nothing. - - // is there a shadow object? - string sfn; - get_oname(oid, sfn, true); - if (::stat(sfn.c_str(), &st) == 0) { - // shadow exists. copy! - dout(10) << "copying shadow for " << oid << " " << st.st_size << " bytes" << endl; - char *buf = new char[1024*1024]; - int left = st.st_size; - - int sfd = ::open(sfn.c_str(), O_RDONLY); - int fd = ::open(fn.c_str(), O_WRONLY); - assert(sfd && fd); - while (left) { - int howmuch = left; - if (howmuch > 1024*1024) howmuch = 1024*1024; - int got = ::read(sfd, buf, howmuch); - int wrote = ::write(fd, buf, got); - assert(wrote == got); - left -= got; - } - ::close(fd); - ::close(sfd); - } -} - int FakeStore::remove(object_t oid) { @@ -226,18 +172,6 @@ int FakeStore::remove(object_t oid) string fn; get_oname(oid,fn); int r = ::unlink(fn.c_str()); - - if (r == 0 && is_shadow) { - string sfn; - struct stat st; - get_oname(oid, sfn, true); - int s = ::stat(sfn.c_str(), &st); - if (s == 0) { - // shadow exists. make a bad symlink to mask it. - ::symlink(sfn.c_str(), "doesnotexist"); - r = 0; - } - } return r; } @@ -245,11 +179,9 @@ int FakeStore::truncate(object_t oid, off_t size) { dout(20) << "truncate " << oid << " size " << size << endl; - if (is_shadow) shadow_copy_maybe(oid); - string fn; get_oname(oid,fn); - ::truncate(fn.c_str(), size); + return ::truncate(fn.c_str(), size); } int FakeStore::read(object_t oid, @@ -262,17 +194,8 @@ int FakeStore::read(object_t oid, int fd = ::open(fn.c_str(), O_RDONLY); if (fd < 0) { - if (is_shadow) { - struct stat st; - if (::lstat(fn.c_str(), &st) == 0) return fd; // neg symlink - get_oname(oid,fn); - fd = ::open(fn.c_str(), O_RDONLY); - if (fd < 0) - return fd; // no shadow either. - } else { - dout(1) << "read couldn't open " << fn.c_str() << " errno " << errno << " " << strerror(errno) << endl; - return fd; - } + dout(1) << "read couldn't open " << fn.c_str() << " errno " << errno << " " << strerror(errno) << endl; + return fd; } ::flock(fd, LOCK_EX); // lock for safety @@ -292,8 +215,6 @@ int FakeStore::write(object_t oid, bool do_fsync) { dout(20) << "write " << oid << " len " << len << " off " << offset << endl; - if (is_shadow) shadow_copy_maybe(oid); - string fn; get_oname(oid,fn); @@ -326,3 +247,90 @@ int FakeStore::write(object_t oid, return did; } + + + + + + +// ------------------ +// collections + +void FakeStore::get_collfn(coll_t c, string &fn) { + char s[100]; + sprintf(s, "collection.%02llx", c); + fn = basedir; + fn += "/"; + fn += s; +} +void FakeStore::open_collection(coll_t c) { + if (collection_map.count(c) == 0) { + string fn; + get_collfn(c,fn); + collection_map[c] = new BDBMap; + collection_map[c]->open(fn.c_str()); + } +} +int FakeStore::collection_create(coll_t c) { + collections.put(c, 1); + open_collection(c); + return 0; +} +int FakeStore::collection_destroy(coll_t c) { + collections.del(c); + + open_collection(c); + collection_map[c]->close(); + + string fn; + get_collfn(c,fn); + collection_map[c]->remove(fn.c_str()); + delete collection_map[c]; + collection_map.erase(c); + return 0; +} +int FakeStore::collection_add(coll_t c, object_t o) { + open_collection(c); + collection_map[c]->put(o,1); + return 0; +} +int FakeStore::collection_remove(coll_t c, object_t o) { + open_collection(c); + collection_map[c]->del(o); + return 0; +} +int FakeStore::collection_list(coll_t c, list& o) { + open_collection(c); + collection_map[c]->list_keys(o); + return 0; +} + + + +// ------------------ +// attributes + +int FakeStore::setattr(object_t oid, const char *name, + void *value, size_t size) +{ + string fn; + get_oname(oid, fn); + return setxattr(fn.c_str(), name, value, size, 0); +} + + +int FakeStore::getattr(object_t oid, const char *name, + void *value, size_t size) +{ + string fn; + get_oname(oid, fn); + return getxattr(fn.c_str(), name, value, size); +} + +int FakeStore::listattr(object_t oid, char *attrs, size_t size) +{ + string fn; + get_oname(oid, fn); + return listxattr(fn.c_str(), attrs, size); +} + diff --git a/ceph/osd/FakeStore.h b/ceph/osd/FakeStore.h index a5a3c1619d8a1..133e96ef4e0d7 100644 --- a/ceph/osd/FakeStore.h +++ b/ceph/osd/FakeStore.h @@ -2,51 +2,35 @@ #define __FAKESTORE_H #include "ObjectStore.h" +#include "BDBMap.h" + +#include +using namespace std; class FakeStore : public ObjectStore { string basedir; int whoami; + // fns void get_dir(string& dir); - void get_oname(object_t oid, string& fn, bool shadow=false); + void get_oname(object_t oid, string& fn); void wipe_dir(string mydir); - /* shadow: copy-on-write behavior against a "starting" clean object store... - - if (is_shadow == true), - shadowdir has same layout as basedir - - if the (normal, live) object file: - - doesn't exist, then use the shadow file if it exists - - does exist, use the live file (in its entirety, COW is on object granularity) - - is a symlink to a nonexistant file, the object doesn't exist (even if it does in the shadow dir) - - write, truncate initiate a copy from shadow -> live. - unlink may create a bad symlink if the shadow file exists - - etc. - - wipe wipes the live dir, effectively revertiing to the shadow fs, so be careful as - this isn't what a MDS mkfs expects! - */ - string shadowdir; - bool is_shadow; - void shadow_copy_maybe(object_t oid); // do copy-on-write.. called by write(), truncate() public: - FakeStore(char *base, int whoami, char *shadow = 0); + FakeStore(char *base, int whoami); int init(); int finalize(); int mkfs(); - bool exists(object_t oid); - int stat(object_t oid, - struct stat *st); + // ------------------ + // objects + bool exists(object_t oid); + int stat(object_t oid, struct stat *st); int remove(object_t oid); int truncate(object_t oid, off_t size); - int read(object_t oid, size_t len, off_t offset, char *buffer); @@ -54,6 +38,36 @@ class FakeStore : public ObjectStore { size_t len, off_t offset, char *buffer, bool fsync); + + + // ------------------- + // collections + + private: + // collection dbs + BDBMap collections; + map*> collection_map; + + void get_collfn(coll_t c, string &fn); + void open_collection(coll_t c); + + public: + int collection_create(coll_t c); + int collection_destroy(coll_t c); + int collection_add(coll_t c, object_t o); + int collection_remove(coll_t c, object_t o); + int collection_list(coll_t c, list& o); + + + // ------------------- + // attributes + + int setattr(object_t oid, const char *name, + void *value, size_t size); + int getattr(object_t oid, const char *name, + void *value, size_t size); + int listattr(object_t oid, char *attrs, size_t size); + }; #endif diff --git a/ceph/osd/OBFSStore.cc b/ceph/osd/OBFSStore.cc index c3584b3ea3173..0bf07d5775936 100644 --- a/ceph/osd/OBFSStore.cc +++ b/ceph/osd/OBFSStore.cc @@ -19,7 +19,7 @@ extern "C" { #include -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfsstore " diff --git a/ceph/osd/OSD.cc b/ceph/osd/OSD.cc index 83c07e0808a4f..eb708244c17c3 100644 --- a/ceph/osd/OSD.cc +++ b/ceph/osd/OSD.cc @@ -36,7 +36,7 @@ #include -#include "include/config.h" +#include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << "osd" << whoami << " " diff --git a/ceph/osd/OSDMap.h b/ceph/osd/OSDMap.h index 7de72611eb413..ecb57463c02c6 100644 --- a/ceph/osd/OSDMap.h +++ b/ceph/osd/OSDMap.h @@ -7,7 +7,7 @@ * whether we're in limbo/recovery state, etc. * */ -#include "include/config.h" +#include "config.h" #include "include/types.h" #include "msg/Message.h" #include "common/Mutex.h" @@ -151,7 +151,7 @@ class OSDCluster { for (vector::iterator it = osd_groups.begin(); it != osd_groups.end(); it++) { - for (int i=0; iosds.size(); i++) + for (unsigned i=0; iosds.size(); i++) ls.insert(it->osds[i]); } } diff --git a/ceph/osd/ObjectStore.h b/ceph/osd/ObjectStore.h index 1344d99df2c11..cffc941951ff3 100644 --- a/ceph/osd/ObjectStore.h +++ b/ceph/osd/ObjectStore.h @@ -3,12 +3,18 @@ #include "include/types.h" +#include +using namespace std; + /* * low-level interface to the local OSD file system */ class ObjectStore { public: + virtual ~ObjectStore() {} + + // mgmt virtual int init() = 0; virtual int finalize() = 0; @@ -29,17 +35,19 @@ class ObjectStore { char *buffer, bool fsync=true) = 0; - /* - // attributes - virtual int setattr(...) = 0; - virtual int getattr(...) = 0; - // collections - virtual int collection_create(coll_t c) = 0; - virtual int collection_destroy(coll_t c) = 0; - virtual int collection_add(coll_t c, object_t o) = 0; - virtual int collection_remove(coll_t c, object_t o) = 0; - */ + virtual int collection_create(coll_t c) {return 0;}//= 0; + virtual int collection_destroy(coll_t c) {return 0;}//= 0; + virtual int collection_add(coll_t c, object_t o) {return 0;}//= 0; + virtual int collection_remove(coll_t c, object_t o) {return 0;}// = 0; + virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; + + // attributes + virtual int setattr(object_t oid, const char *name, + void *value, size_t size) {return 0;} //= 0; + virtual int getattr(object_t oid, const char *name, + void *value, size_t size) {return 0;} //= 0; + virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; }; diff --git a/ceph/osd/rush.cc b/ceph/osd/rush.cc index 574dbae90eb83..deae1e86e54ea 100644 --- a/ceph/osd/rush.cc +++ b/ceph/osd/rush.cc @@ -161,7 +161,7 @@ void RushRNG::DrawKofN (int vals[], int nToDraw, int setSize) { int deck[setSize]; - int i, pick, t; + int i, pick; assert(nToDraw <= setSize); diff --git a/ceph/osdc/Filer.cc b/ceph/osdc/Filer.cc index babebb5b2a096..2073b945556f3 100644 --- a/ceph/osdc/Filer.cc +++ b/ceph/osdc/Filer.cc @@ -15,7 +15,7 @@ #include "include/Context.h" -#include "include/config.h" +#include "config.h" #undef dout #define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << "filer: " @@ -89,11 +89,9 @@ Filer::read(inodeno_t ino, dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << p->extents.size() << " object extents" << endl; // issue reads - off_t off = 0; for (list::iterator it = p->extents.begin(); it != p->extents.end(); it++) { - int r = 0; // pick a replica last_tid++; // issue read @@ -158,8 +156,8 @@ Filer::handle_osd_read_reply(MOSDOpReply *m) eit != p->extents.end(); eit++) { bufferlist *ox_buf = p->read_data[eit->oid]; - int ox_len = ox_buf->length(); - int ox_off = 0; + unsigned ox_len = ox_buf->length(); + unsigned ox_off = 0; assert(ox_len <= eit->len); // for each buffer extent we're mapping into... @@ -285,7 +283,6 @@ Filer::write(inodeno_t ino, for (list::iterator it = extents.begin(); it != extents.end(); it++) { - int r = 0; // pick a replica last_tid++; // issue write @@ -421,10 +418,10 @@ Filer::handle_osd_op_reply(MOSDOpReply *m) } -int Filer::remove(inodeno_t ino, - OSDFileLayout& layout, - size_t size, - Context *onfinish) +int Filer::truncate(inodeno_t ino, + OSDFileLayout& layout, + size_t new_size, size_t old_size, + Context *onfinish) { // pending write record PendingOSDOp_t *p = new PendingOSDOp_t; @@ -432,23 +429,29 @@ int Filer::remove(inodeno_t ino, // find data list extents; - osdcluster->file_to_extents(ino, layout, size, 0, extents); + osdcluster->file_to_extents(ino, layout, old_size, new_size, extents); - dout(7) << "osd remove ino " << ino << " size " << size << " in " << extents.size() << " extents" << endl; - - size_t off = 0; // ptr into buffer + dout(7) << "osd truncate ino " << ino << " to new size " << new_size << " from old_size " << old_size << " in " << extents.size() << " extents" << endl; int n = 0; for (list::iterator it = extents.begin(); it != extents.end(); it++) { - int r = 0; // pick a replica last_tid++; - // issue delete - MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(), - it->oid, it->rg, osdcluster->get_version(), - OSD_OP_DELETE); + MOSDOp *m; + if (it->offset == 0) { + // issue delete + m = new MOSDOp(last_tid, messenger->get_myaddr(), + it->oid, it->rg, osdcluster->get_version(), + OSD_OP_DELETE); + } else { + // issue a truncate + m = new MOSDOp(last_tid, messenger->get_myaddr(), + it->oid, it->rg, osdcluster->get_version(), + OSD_OP_TRUNCATE); + m->set_length( it->offset ); + } messenger->send_message(m, MSG_ADDR_OSD(it->osd), 0); // add to gather set @@ -464,9 +467,12 @@ int Filer::remove(inodeno_t ino, delete onfinish; } } + + return 0; } +/* int Filer::probe_size(inodeno_t ino, OSDFileLayout& layout, size_t *size, @@ -484,9 +490,9 @@ int Filer::probe_size(inodeno_t ino, // stat first object - + return 0; } - +*/ // mkfs on all osds, wipe everything. @@ -519,6 +525,8 @@ int Filer::mkfs(Context *onfinish) p->outstanding_ops.insert(last_tid); op_mkfs[last_tid] = p; } + + return 0; } diff --git a/ceph/osdc/Filer.h b/ceph/osdc/Filer.h index 38e73b51f1199..ff55a6630724b 100644 --- a/ceph/osdc/Filer.h +++ b/ceph/osdc/Filer.h @@ -109,9 +109,17 @@ class Filer : public Dispatcher { int probe_size(inodeno_t ino, OSDFileLayout& layout, size_t *size, Context *c); - int remove(inodeno_t ino, + + int remove(inodeno_t ino, OSDFileLayout& layout, - size_t size, Context *c); + size_t old_size, + Context *c) { + return truncate(ino, layout, 0, old_size, c); + } + int truncate(inodeno_t ino, + OSDFileLayout& layout, + size_t new_size, size_t old_size, + Context *c); //int zero(inodeno_t ino, size_t len, size_t offset, Context *c); diff --git a/ceph/tcpfuse.cc b/ceph/tcpfuse.cc index 87f0b2e1899e5..a727a2d77c5f0 100644 --- a/ceph/tcpfuse.cc +++ b/ceph/tcpfuse.cc @@ -5,7 +5,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #include "mds/MDCluster.h" #include "mds/MDS.h" diff --git a/ceph/tcpsyn.cc b/ceph/tcpsyn.cc index 4ff13dd14e1d7..92d71295fac26 100644 --- a/ceph/tcpsyn.cc +++ b/ceph/tcpsyn.cc @@ -4,7 +4,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #include "mds/MDCluster.h" #include "mds/MDS.h" diff --git a/ceph/test/fakemds.cc b/ceph/test/fakemds.cc index ebd2b737b99ed..a4daa3f8b8b2f 100644 --- a/ceph/test/fakemds.cc +++ b/ceph/test/fakemds.cc @@ -22,7 +22,7 @@ __uint64_t ino = 1; -#include "include/config.h" +#include "config.h" #define NUMMDS g_conf.num_mds #define NUMOSD g_conf.num_osd #define NUMCLIENT g_conf.num_fakeclient diff --git a/ceph/test/mpitest.cc b/ceph/test/mpitest.cc index 9b5cb232e10e6..14fcadad6eda1 100644 --- a/ceph/test/mpitest.cc +++ b/ceph/test/mpitest.cc @@ -23,7 +23,7 @@ __uint64_t ino = 1; -#include "include/config.h" +#include "config.h" #define NUMMDS g_conf.num_mds #define NUMOSD g_conf.num_osd #define NUMCLIENT g_conf.num_client diff --git a/ceph/test/testmpi.cc b/ceph/test/testmpi.cc index 283efadb751fa..4d2d7c4ddf780 100644 --- a/ceph/test/testmpi.cc +++ b/ceph/test/testmpi.cc @@ -3,7 +3,7 @@ #include using namespace std; -#include "include/config.h" +#include "config.h" #include "messages/MPing.h" #include "common/Mutex.h" -- 2.39.5