From 922c58860165f286c995bcbad1b32361510d7a2d Mon Sep 17 00:00:00 2001 From: sage Date: Tue, 26 Sep 2006 22:22:01 +0000 Subject: [PATCH] client readdir() realted stuff; some mds bug fixes; git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@881 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/Makefile | 5 +- ceph/TODO | 46 ++- ceph/client/Client.cc | 447 ++++++++++++++++++++++-------- ceph/client/Client.h | 48 +++- ceph/client/SyntheticClient.cc | 24 +- ceph/client/fuse.cc | 9 +- ceph/client/statlite.h | 21 ++ ceph/common/Clock.h | 2 +- ceph/fakefuse.cc | 21 +- ceph/include/types.h | 52 +++- ceph/mds/CDir.h | 2 +- ceph/mds/Capability.h | 12 +- ceph/mds/Lock.h | 60 ++-- ceph/mds/MDCache.cc | 55 ++-- ceph/mds/MDS.cc | 92 +++--- ceph/mds/MDS.h | 4 +- ceph/messages/MClientReply.h | 182 +++++++----- ceph/messages/MClientRequest.h | 1 + ceph/messages/MHashReaddirReply.h | 42 ++- ceph/osdc/Filer.cc | 10 +- 20 files changed, 774 insertions(+), 361 deletions(-) diff --git a/ceph/Makefile b/ceph/Makefile index 5fd248864fdb8..5365911981140 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -9,7 +9,7 @@ # behave just fine... change ${CC} back to mpicxx if you get paranoid. CC = g++ -CFLAGS = -O4 -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE LIBS = -lpthread #for normal mpich2 machines @@ -109,7 +109,7 @@ gprof-helper.so: test/gprof-helper.c # fuse -fakefuse: fakefuse.cc mds.o client.o osd.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o +fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@ tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o @@ -148,7 +148,6 @@ mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ -ebofs: mkfs.ebofs test.ebofs diff --git a/ceph/TODO b/ceph/TODO index 66dadb3f38603..9961a147430e2 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,10 +1,22 @@ + + == todo - how to get usage feedback to monitor? +fixit list +- opendir, readdir, rewinddir, etc. (via getdir) +- statlite +- readdir +- readdirpluslite +- lazio_* +- mds stat vs caps behavior. esp wrt readdirplus, etc. osd/rados +- don't be so pessimistic about discarding write-ahead +- properly handle divergent pg logs and objects +- flag missing log entries on crash recovery - consider implications of nvram writeahead logs - deal with divergent replicas that recover - fix heartbeat wrt new replication @@ -15,7 +27,6 @@ osd/rados - use pg->info.same_role_since wrt replication ops. - report crashed pgs? - messenger - lookup upcall, - distributed namer (eg send to MSG_ADDR_MON_ANY) @@ -32,8 +43,6 @@ monitor - monitor pgs, notify on out - watch osd utilization; adjust overload in cluster map - - objecter objectcacher @@ -43,21 +52,19 @@ objectcacher reliability - heartbeat vs ping -- how to choose peer sets - osdmonitor, filter ebofs -- nonblocking write on missing onodes? -- reallocate if dirty is cancelable. specifically, so that pg log writes are efficient. - fix sync() -- fix bug in node rotation on insert (and reenable) +- clone() - snapshots +- combine inodes and/or cnodes into same blocks - allow btree sets instead of maps +- nonblocking write on missing onodes? - verify LRU behavior sensible: writes go to mid, not top! +- fix bug in node rotation on insert (and reenable) - fix NEAR_LAST_FWD (?) -- combine inodes and/or cnodes into same blocks - journaling? in NVRAM? -- clone() @@ -68,7 +75,6 @@ bugs/stability general - timer needs cancel sets, schedulers need to cancel outstanding events on shutdown - well, just figure out general timer cancellation strategy that avoids races -- gzip in messenger? remaining hard problems @@ -80,13 +86,12 @@ crush - more efficient failure when all/too many osds are down mds -- only share osdmap updates with clients holding capabilities -- statlite -- stat single writer -- truncate() +- efficient stat for single writers - chdir (directory opens!) +- lstat vs stat +- add FILE_CAP_EXTEND capability bit +- only share osdmap updates with clients holding capabilities - delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) -- review caps logic versus singular - finish hard links! - reclaim danglers from inode file on discover... - fix rename wrt hard links @@ -110,18 +115,7 @@ client - some heuristic behavior to consolidate caps to inode auth - client will re-tx anything it needed to say upon rx of new mds notification (?) -- readv+writev, readx+writex - - serialized! - - - - -cluster issues -- communications failure model.. is it appropriate? - - reliable, ordered, buffered and flushed on 'down' boundaries? - - ordered, unreliable? -- what about large messages? :( diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index f1bf27d156f90..7b0596c0a6474 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -66,6 +66,7 @@ Client::Client(Messenger *m) mounted = false; unmounting = false; + last_tid = 0; unsafe_sync_write = 0; // @@ -213,22 +214,32 @@ void Client::trim_cache() } } -// insert inode info into metadata cache - -Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) +/** insert_inode + * + * insert + link a single dentry + inode into the metadata cache. + */ +Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) { - string dname = in_info->ref_dn; Dentry *dn = NULL; if (dir->dentries.count(dname)) dn = dir->dentries[dname]; - dout(12) << "insert_inode_info " << dname << " ino " << hex << in_info->inode.ino << dec << " size " << in_info->inode.size << " hashed " << in_info->hashed << endl; + + dout(12) << "insert_inode " << dname << " ino " << hex << st->inode.ino << dec + << " size " << st->inode.size + << " mtime " << st->inode.mtime + << " hashed " << st->hashed + << endl; if (dn) { - if (dn->inode->inode.ino == in_info->inode.ino) { + if (dn->inode->inode.ino == st->inode.ino) { touch_dn(dn); - dout(12) << " had dentry " << dname << " with correct ino " << hex << dn->inode->inode.ino << dec << endl; + dout(12) << " had dentry " << dname + << " with correct ino " << hex << dn->inode->inode.ino << dec + << endl; } else { - dout(12) << " had dentry " << dname << " with WRONG ino " << hex << dn->inode->inode.ino << dec << endl; + dout(12) << " had dentry " << dname + << " with WRONG ino " << hex << dn->inode->inode.ino << dec + << endl; unlink(dn); dn = NULL; } @@ -236,29 +247,41 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) if (!dn) { // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(in_info->inode.ino)) { - Inode *in = inode_map[in_info->inode.ino]; + if (inode_map.count(st->inode.ino)) { + Inode *in = inode_map[st->inode.ino]; assert(in); if (in->dn) { - dout(12) << " had ino " << hex << in->inode.ino << dec << " linked at wrong position, unlinking" << endl; + dout(12) << " had ino " << hex << in->inode.ino << dec + << " linked at wrong position, unlinking" + << endl; dn = relink(in->dn, dir, dname); } else { // link - dout(12) << " had ino " << hex << in->inode.ino << dec << " unlinked, linking" << endl; + dout(12) << " had ino " << hex << in->inode.ino << dec + << " unlinked, linking" << endl; dn = link(dir, dname, in); } } } if (!dn) { - Inode *in = new Inode(in_info->inode, objectcacher); - inode_map[in_info->inode.ino] = in; + Inode *in = new Inode(st->inode, objectcacher); + inode_map[st->inode.ino] = in; dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << hex << in_info->inode.ino << dec << endl; + dout(12) << " new dentry+node with ino " << hex << st->inode.ino << dec << endl; } else { // actually update info - dn->inode->inode = in_info->inode; + dout(12) << " stat inode mask is " << st->inode.mask << endl; + dn->inode->inode = st->inode; + + // ...but don't clobber our mtime, size! + if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && + dn->inode->file_wr_size > dn->inode->inode.size) + dn->inode->inode.size = dn->inode->file_wr_size; + if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && + dn->inode->file_wr_mtime > dn->inode->inode.mtime) + dn->inode->inode.mtime = dn->inode->file_wr_mtime; } // OK, we found it! @@ -276,72 +299,79 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { if (!dn->inode->symlink) dn->inode->symlink = new string; - *(dn->inode->symlink) = in_info->symlink; + *(dn->inode->symlink) = st->symlink; } - // dir info - dn->inode->dir_auth = in_info->dir_auth; - dn->inode->dir_hashed = in_info->hashed; - dn->inode->dir_replicated = in_info->replicated; + return dn->inode; +} +/** update_inode_dist + * + * update MDS location cache for a single inode + */ +void Client::update_inode_dist(Inode *in, InodeStat *st) +{ + // dir info + in->dir_auth = st->dir_auth; + in->dir_hashed = st->hashed; + in->dir_replicated = st->replicated; + // dir replication - if (in_info->spec_defined) { - if (in_info->dist.empty() && !dn->inode->dir_contacts.empty()) - dout(9) << "lost dist spec for " << hex << dn->inode->inode.ino << dec - << " " << in_info->dist << endl; - if (!in_info->dist.empty() && dn->inode->dir_contacts.empty()) - dout(9) << "got dist spec for " << hex << dn->inode->inode.ino << dec - << " " << in_info->dist << endl; - dn->inode->dir_contacts = in_info->dist; + if (st->spec_defined) { + if (st->dist.empty() && !in->dir_contacts.empty()) + dout(9) << "lost dist spec for " << hex << in->inode.ino << dec + << " " << st->dist << endl; + if (!st->dist.empty() && in->dir_contacts.empty()) + dout(9) << "got dist spec for " << hex << in->inode.ino << dec + << " " << st->dist << endl; + in->dir_contacts = st->dist; } - - return dn->inode; } -// insert trace of reply into metadata cache - -void Client::insert_trace(const vector& trace) +/** insert_trace + * + * insert a trace from a MDS reply into the cache. + */ +void Client::insert_trace(MClientReply *reply) { Inode *cur = root; time_t now = time(NULL); - if (trace.empty()) { - return; - } - - for (unsigned i=0; iget_trace_in().size() << " inodes" << endl; + + list::const_iterator pdn = reply->get_trace_dn().begin(); + for (list::const_iterator pin = reply->get_trace_in().begin(); + pin != reply->get_trace_in().end(); + ++pin) { + + if (pin == reply->get_trace_in().begin()) { + // root + dout(10) << "insert_trace root" << endl; if (!root) { - cur = root = new Inode(in_info->inode, objectcacher); + // create + cur = root = new Inode((*pin)->inode, objectcacher); inode_map[root->inode.ino] = root; } - - if (g_conf.client_cache_stat_ttl) - root->valid_until = now + g_conf.client_cache_stat_ttl; - - root->dir_auth = in_info->dir_auth; - assert(root->dir_auth == 0); - root->dir_hashed = in_info->hashed; - root->dir_replicated = in_info->replicated; - if (in_info->spec_defined) - root->dir_contacts = in_info->dist; - - dout(12) << "insert_trace trace " << i << " root .. rep=" << root->dir_replicated << endl; - } else { - dout(12) << "insert_trace trace " << i << endl; + } else { + // not root. + dout(10) << "insert_trace dn " << *pdn << " ino " << hex << (*pin)->inode.ino << dec << endl; Dir *dir = cur->open_dir(); - cur = this->insert_inode_info(dir, trace[i]); - - if (g_conf.client_cache_stat_ttl) - cur->valid_until = now + g_conf.client_cache_stat_ttl; + cur = this->insert_inode(dir, *pin, *pdn); + ++pdn; // move to top of lru! - if (cur->dn) lru.lru_touch(cur->dn); + if (cur->dn) + lru.lru_touch(cur->dn); + } + + // update dist info + update_inode_dist(cur, *pin); - } + // set cache ttl + if (g_conf.client_cache_stat_ttl) + cur->valid_until = now + g_conf.client_cache_stat_ttl; } } @@ -389,6 +419,9 @@ MClientReply *Client::make_request(MClientRequest *req, bool auth_best, int use_mds) // this param is icky, debug weirdness! { + // assign a unique tid + req->set_tid(++last_tid); + // find deepest known prefix Inode *diri = root; // the deepest known containing dir Inode *item = 0; // the actual item... if we know it @@ -475,8 +508,10 @@ MClientReply *Client::make_request(MClientRequest *req, messenger->send_message(req, MSG_ADDR_MDS(mds), MDS_PORT_SERVER); // wait - while (mds_rpc_reply.count(tid) == 0) + while (mds_rpc_reply.count(tid) == 0) { + dout(20) << "make_request awaiting reply kick on " << &cond << endl; cond.Wait(client_lock); + } // got it! reply = mds_rpc_reply[tid]; @@ -484,7 +519,8 @@ MClientReply *Client::make_request(MClientRequest *req, // kick dispatcher (we've got it!) assert(mds_rpc_dispatch_cond.count(tid)); mds_rpc_dispatch_cond[tid]->Signal(); - + dout(20) << "make_request kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl; + // clean up. mds_rpc_cond.erase(tid); mds_rpc_reply.erase(tid); @@ -537,14 +573,17 @@ void Client::handle_client_reply(MClientReply *reply) // wake up waiter assert(mds_rpc_cond.count(tid)); + dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl; mds_rpc_cond[tid]->Signal(); // wake for kick back assert(mds_rpc_dispatch_cond.count(tid) == 0); Cond cond; mds_rpc_dispatch_cond[tid] = &cond; - while (mds_rpc_cond.count(tid)) + while (mds_rpc_cond.count(tid)) { + dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; cond.Wait(client_lock); + } // ok, clean up! mds_rpc_dispatch_cond.erase(tid); @@ -746,6 +785,7 @@ void Client::handle_file_caps(MClientFileCaps *m) // did file size decrease? if ((old_caps & new_caps & CAP_FILE_RDCACHE) && in->inode.size > m->get_inode().size) { + dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl; // must have been a truncate() by someone. // trim the buffer cache // ***** fixme write me **** @@ -754,6 +794,13 @@ void Client::handle_file_caps(MClientFileCaps *m) // update inode in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! + // preserve our (possibly newer) file size, mtime + if (in->file_wr_size > in->inode.size) + m->get_inode().size = in->inode.size = in->file_wr_size; + if (in->file_wr_mtime > in->inode.mtime) + m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; + + if (g_conf.client_oc) { // caching on, use FileCache. Context *onimplement = 0; @@ -989,7 +1036,7 @@ int Client::link(const char *existing, const char *newname) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "link result is " << res << endl; @@ -1032,7 +1079,7 @@ int Client::unlink(const char *relpath) unlink(dn); } } - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "unlink result is " << res << endl; @@ -1070,7 +1117,7 @@ int Client::rename(const char *relfrom, const char *relto) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "rename result is " << res << endl; @@ -1107,7 +1154,7 @@ int Client::mkdir(const char *relpath, mode_t mode) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "mkdir result is " << res << endl; @@ -1150,7 +1197,7 @@ int Client::rmdir(const char *relpath) unlink(dn); } } - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "rmdir result is " << res << endl; @@ -1190,7 +1237,7 @@ int Client::symlink(const char *reltarget, const char *rellink) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); //FIXME assuming trace of link, not of target + insert_trace(reply); //FIXME assuming trace of link, not of target delete reply; dout(10) << "symlink result is " << res << endl; @@ -1251,17 +1298,19 @@ int Client::lstat(const char *relpath, struct stat *stbuf) // FIXME, PERF request allocation convenient but not necessary for cache hit - MClientRequest *req = new MClientRequest(MDS_OP_STAT, whoami); - req->set_path(path); + MClientRequest *req = 0; + filepath fpath(path); // check whether cache content is fresh enough int res = 0; - Dentry *dn = lookup(req->get_filepath()); + Dentry *dn = lookup(fpath); inode_t inode; time_t now = time(NULL); - if (dn && now <= dn->inode->valid_until) { + if (dn && + now <= dn->inode->valid_until && + ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { inode = dn->inode->inode; - dout(10) << "lstat cache hit, valid until " << dn->inode->valid_until << endl; + dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; if (g_conf.client_cache_stat_ttl == 0) dn->inode->valid_until = 0; // only one stat allowed after each readdir @@ -1273,38 +1322,25 @@ int Client::lstat(const char *relpath, struct stat *stbuf) //req->set_caller_uid(fc->uid); //req->set_caller_gid(fc->gid); + req = new MClientRequest(MDS_OP_LSTAT, whoami); + req->set_path(fpath); + MClientReply *reply = make_request(req); res = reply->get_result(); dout(10) << "lstat res is " << res << endl; if (res == 0) { //Transfer information from reply to stbuf - vector trace = reply->get_trace(); - inode = trace[trace.size()-1]->inode; + inode = reply->get_inode(); //Update metadata cache - this->insert_trace(trace); + insert_trace(reply); } delete reply; } if (res == 0) { - memset(stbuf, 0, sizeof(struct stat)); - //stbuf->st_dev = - stbuf->st_ino = inode.ino; - stbuf->st_mode = inode.mode; - stbuf->st_nlink = inode.nlink; - stbuf->st_uid = inode.uid; - stbuf->st_gid = inode.gid; - stbuf->st_ctime = inode.ctime; - stbuf->st_atime = inode.atime; - stbuf->st_mtime = inode.mtime; - stbuf->st_size = inode.size; - stbuf->st_blocks = inode.size ? ((inode.size - 1) / 1024 + 1):0; - stbuf->st_blksize = 1024; - //stbuf->st_flags = - //stbuf->st_gen = - + inode.fill_stat(stbuf); dout(10) << "stat sez size = " << inode.size << " uid = " << inode.uid << " ino = " << hex << stbuf->st_ino << dec << endl; } @@ -1352,7 +1388,7 @@ int Client::chmod(const char *relpath, mode_t mode) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "chmod result is " << res << endl; @@ -1389,7 +1425,7 @@ int Client::chown(const char *relpath, uid_t uid, gid_t gid) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "chown result is " << res << endl; @@ -1427,7 +1463,7 @@ int Client::utime(const char *relpath, struct utimbuf *buf) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << "utime result is " << res << endl; @@ -1464,7 +1500,7 @@ int Client::mknod(const char *relpath, mode_t mode) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); dout(10) << "mknod result is " << res << endl; @@ -1485,7 +1521,7 @@ int Client::mknod(const char *relpath, mode_t mode) // fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); -int Client::getdir(const char *relpath, map& contents) +int Client::getdir(const char *relpath, map& contents) { client_lock.Lock(); @@ -1509,27 +1545,31 @@ int Client::getdir(const char *relpath, map& contents) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - vector trace = reply->get_trace(); - this->insert_trace(trace); + insert_trace(reply); if (res == 0) { // dir contents to cache! - inodeno_t ino = trace[trace.size()-1]->inode.ino; + inodeno_t ino = reply->get_ino(); Inode *diri = inode_map[ ino ]; assert(diri); assert(diri->inode.mode & INODE_MODE_DIR); - if (reply->get_dir_contents().size()) { + if (!reply->get_dir_in().empty()) { // only open dir if we're actually adding stuff to it! Dir *dir = diri->open_dir(); assert(dir); time_t now = time(NULL); - for (vector::iterator it = reply->get_dir_contents().begin(); - it != reply->get_dir_contents().end(); - it++) { + + list::const_iterator pdn = reply->get_dir_dn().begin(); + for (list::const_iterator pin = reply->get_dir_in().begin(); + pin != reply->get_dir_in().end(); + ++pin, ++pdn) { + // count entries + res++; + // put in cache - Inode *in = this->insert_inode_info(dir, *it); + Inode *in = this->insert_inode(dir, *pin, *pdn); if (g_conf.client_cache_stat_ttl) in->valid_until = now + g_conf.client_cache_stat_ttl; @@ -1537,9 +1577,15 @@ int Client::getdir(const char *relpath, map& contents) in->valid_until = now + g_conf.client_cache_readdir_ttl; // contents to caller too! - contents[(*it)->ref_dn] = &in->inode; + contents[*pdn] = in->inode; } } + + // add .. too? + if (diri != root && diri->dn && diri->dn->dir) { + Inode *parent = diri->dn->dir->parent_inode; + contents[".."] = parent->inode; + } // FIXME: remove items in cache that weren't in my readdir? // *** @@ -1552,6 +1598,182 @@ int Client::getdir(const char *relpath, map& contents) } +/** POSIX stubs **/ + +DIR *Client::opendir(const char *name) +{ + DirResult *d = new DirResult; + d->size = getdir(name, d->contents); + d->p = d->contents.begin(); + d->off = 0; + return (DIR*)d; +} + +int Client::closedir(DIR *dir) +{ + DirResult *d = (DirResult*)dir; + delete d; + return 0; +} + +//struct dirent { +// ino_t d_ino; /* inode number */ +// off_t d_off; /* offset to the next dirent */ +// unsigned short d_reclen; /* length of this record */ +// unsigned char d_type; /* type of file */ +// char d_name[256]; /* filename */ +//}; + +struct dirent *Client::readdir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? + if (d->p == d->contents.end()) + return 0; + + // fill the dirent + d->dp.d_dirent.d_ino = d->p->second.ino; + if (d->p->second.is_symlink()) + d->dp.d_dirent.d_type = DT_LNK; + else if (d->p->second.is_dir()) + d->dp.d_dirent.d_type = DT_DIR; + else if (d->p->second.is_file()) + d->dp.d_dirent.d_type = DT_REG; + else + d->dp.d_dirent.d_type = DT_UNKNOWN; + strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); + + d->dp.d_dirent.d_off = d->off; + d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) + + // move up + ++d->off; + ++d->p; + + return &d->dp.d_dirent; +} + +void Client::rewinddir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + d->p = d->contents.begin(); + d->off = 0; +} + +off_t Client::telldir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + return d->off; +} + +void Client::seekdir(DIR *dirp, off_t offset) +{ + DirResult *d = (DirResult*)dirp; + + d->p = d->contents.begin(); + d->off = 0; + + if (offset >= d->size) offset = d->size-1; + while (offset > 0) { + ++d->p; + ++d->off; + --offset; + } +} + +struct dirent_plus *Client::readdirplus(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? + if (d->p == d->contents.end()) + return 0; + + // fill the dirent + d->dp.d_dirent.d_ino = d->p->second.ino; + if (d->p->second.is_symlink()) + d->dp.d_dirent.d_type = DT_LNK; + else if (d->p->second.is_dir()) + d->dp.d_dirent.d_type = DT_DIR; + else if (d->p->second.is_file()) + d->dp.d_dirent.d_type = DT_REG; + else + d->dp.d_dirent.d_type = DT_UNKNOWN; + strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); + + d->dp.d_dirent.d_off = d->off; + d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) + + // plus + if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { + // have it + d->p->second.fill_stat(&d->dp.d_stat); + d->dp.d_stat_err = 0; + } else { + // don't have it, stat it + string path = d->path; + path += "/"; + path += d->p->first; + d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); + } + + // move up + ++d->off; + ++d->p; + + return &d->dp; +} + +/* +struct dirent_lite *Client::readdirlite(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? + if (d->p == d->contents.end()) + return 0; + + // fill the dirent + d->dp.d_dirent.d_ino = d->p->second.ino; + if (d->p->second.is_symlink()) + d->dp.d_dirent.d_type = DT_LNK; + else if (d->p->second.is_dir()) + d->dp.d_dirent.d_type = DT_DIR; + else if (d->p->second.is_file()) + d->dp.d_dirent.d_type = DT_REG; + else + d->dp.d_dirent.d_type = DT_UNKNOWN; + strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); + + d->dp.d_dirent.d_off = d->off; + d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) + + // plus + if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { + // have it + d->p->second.fill_stat(d->dp.d_stat); + d->dp.d_stat_err = 0; + } else { + // don't have it, stat it + string path = p->path; + path += "/"; + path += p->first; + d->dp.d_statlite + d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite); + } + + // move up + ++d->off; + ++d->p; + + return &d->dp; +} +*/ + + + + /****** file i/o **********/ @@ -1595,8 +1817,7 @@ int Client::open(const char *relpath, int mode) dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; tout << reply->get_result() << endl; - vector trace = reply->get_trace(); - this->insert_trace(trace); + insert_trace(reply); int result = reply->get_result(); // success? @@ -1607,7 +1828,7 @@ int Client::open(const char *relpath, int mode) f->mode = cmode; // inode - f->inode = inode_map[trace[trace.size()-1]->inode.ino]; + f->inode = inode_map[reply->get_ino()]; assert(f->inode); f->inode->get(); @@ -2021,7 +2242,7 @@ int Client::truncate(const char *file, off_t size) MClientReply *reply = make_request(req, true); int res = reply->get_result(); - this->insert_trace(reply->get_trace()); + insert_trace(reply); delete reply; dout(10) << " truncate result is " << res << endl; diff --git a/ceph/client/Client.h b/ceph/client/Client.h index 2b98846deb4b2..9c8431e02399e 100644 --- a/ceph/client/Client.h +++ b/ceph/client/Client.h @@ -15,6 +15,10 @@ #ifndef __CLIENT_H #define __CLIENT_H +extern "C" { +#include +} + #include "mds/MDCluster.h" #include "osd/OSDMap.h" @@ -27,6 +31,7 @@ #include "messages/MClientReply.h" //#include "msgthread.h" +#include "statlite.h" #include "include/types.h" #include "include/lru.h" @@ -287,11 +292,27 @@ struct Fh { // client interface class Client : public Dispatcher { + public: + + /* getdir result */ + struct DirResult { + string path; + map contents; + map::iterator p; + int off; + int size; + struct dirent_plus dp; + struct dirent_lite dl; + DirResult() : p(contents.end()), off(-1), size(0) {} + }; + + protected: Messenger *messenger; int whoami; // mds fake RPC + tid_t last_tid; map mds_rpc_cond; map mds_rpc_reply; map mds_rpc_dispatch_cond; @@ -379,7 +400,7 @@ protected: int get_cache_size() { return lru.lru_get_size(); } void set_cache_size(int m) { lru.lru_set_max(m); } - Dentry* link(Dir *dir, string& name, Inode *in) { + Dentry* link(Dir *dir, const string& name, Inode *in) { Dentry *dn = new Dentry; dn->name = name; @@ -415,7 +436,7 @@ protected: delete dn; } - Dentry *relink(Dentry *dn, Dir *dir, string& name) { + Dentry *relink(Dentry *dn, Dir *dir, const string& name) { // first link new dn to dir /* char *oldname = (char*)dn->name; @@ -480,9 +501,9 @@ protected: void close_safe(Inode *in); // metadata cache - Inode* insert_inode_info(Dir *dir, c_inode_info *in_info); - void insert_trace(const vector& trace); - + Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); + void update_inode_dist(Inode *in, InodeStat *st); + void insert_trace(MClientReply *reply); // ---------------------- // fs ops. @@ -496,7 +517,22 @@ protected: int chdir(const char *s); // namespace ops - int getdir(const char *path, map& contents); + int getdir(const char *path, list& contents); + int getdir(const char *path, map& contents); + + DIR *opendir(const char *name); + int closedir(DIR *dir); + struct dirent *readdir(DIR *dir); + void rewinddir(DIR *dir); + off_t telldir(DIR *dir); + void seekdir(DIR *dir, off_t offset); + + struct dirent_plus *readdirplus(DIR *dirp); + int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); + struct dirent_lite *readdirlite(DIR *dirp); + int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); + + int link(const char *existing, const char *newname); int unlink(const char *path); int rename(const char *from, const char *to); diff --git a/ceph/client/SyntheticClient.cc b/ceph/client/SyntheticClient.cc index 0b60f35c5d421..206c94dbb058e 100644 --- a/ceph/client/SyntheticClient.cc +++ b/ceph/client/SyntheticClient.cc @@ -610,7 +610,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) client->mknod(a, b); } else if (strcmp(op, "getdir") == 0) { const char *a = t.get_string(p); - map contents; + map contents; client->getdir(a, contents); } else if (strcmp(op, "open") == 0) { const char *a = t.get_string(p); @@ -665,14 +665,14 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) int SyntheticClient::clean_dir(string& basedir) { // read dir - map contents; + map contents; int r = client->getdir(basedir.c_str(), contents); if (r < 0) { dout(1) << "readdir on " << basedir << " returns " << r << endl; return r; } - for (map::iterator it = contents.begin(); + for (map::iterator it = contents.begin(); it != contents.end(); it++) { string file = basedir + "/" + it->first; @@ -686,7 +686,7 @@ int SyntheticClient::clean_dir(string& basedir) continue; } - if (st.st_mode & INODE_MODE_DIR) { + if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { clean_dir(file); client->rmdir(file.c_str()); } else { @@ -704,14 +704,14 @@ int SyntheticClient::full_walk(string& basedir) if (time_to_stop()) return -1; // read dir - map contents; + map contents; int r = client->getdir(basedir.c_str(), contents); if (r < 0) { dout(1) << "readdir on " << basedir << " returns " << r << endl; return r; } - for (map::iterator it = contents.begin(); + for (map::iterator it = contents.begin(); it != contents.end(); it++) { string file = basedir + "/" + it->first; @@ -723,7 +723,7 @@ int SyntheticClient::full_walk(string& basedir) continue; } - if (st.st_mode & INODE_MODE_DIR) full_walk(file); + if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) full_walk(file); } return 0; @@ -797,7 +797,7 @@ int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int dep char d[500]; dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - map contents; + map contents; utime_t s = g_clock.now(); int r = client->getdir(basedir, contents); utime_t e = g_clock.now(); @@ -1165,15 +1165,15 @@ int SyntheticClient::random_walk(int num_req) if (op == MDS_OP_READDIR) { clear_dir(); - map c; + map c; r = client->getdir( cwd.c_str(), c ); - for (map::iterator it = c.begin(); + for (map::iterator it = c.begin(); it != c.end(); it++) { //dout(DBL) << " got " << it->first << endl; - contents[it->first] = *(it->second); - if (it->second->mode & INODE_MODE_DIR) + contents[it->first] = it->second; + if (it->second.is_dir()) subdirs.insert(it->first); } diff --git a/ceph/client/fuse.cc b/ceph/client/fuse.cc index 71aeae211c92d..0f7b89f03ff4a 100644 --- a/ceph/client/fuse.cc +++ b/ceph/client/fuse.cc @@ -79,19 +79,20 @@ static int ceph_readlink(const char *path, char *buf, size_t size) static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) { - map contents; + map contents; int res = client->getdir(path, contents); if (res < 0) return res; // return contents to fuse via callback - for (map::iterator it = contents.begin(); + for (map::iterator it = contents.begin(); it != contents.end(); it++) { + // (immutable) inode contents too. res = filler(h, // fuse's handle it->first.c_str(), // dentry as char* - it->second->mode & INODE_TYPE_MASK, // mask type bits from mode - it->second->ino); // ino.. 64->32 bit issue here? FIXME + it->second.mode & INODE_TYPE_MASK, // mask type bits from mode + it->second.ino); // ino.. 64->32 bit issue here? FIXME if (res != 0) break; // fuse has had enough } return res; diff --git a/ceph/client/statlite.h b/ceph/client/statlite.h index 5c3d33de24830..396ff432d15c5 100644 --- a/ceph/client/statlite.h +++ b/ceph/client/statlite.h @@ -1,10 +1,13 @@ #ifndef _STATLITE_H #define _STATLITE_H +extern "C" { + #include #include #include #include +#include struct statlite { dev_t st_dev; /* device */ @@ -20,6 +23,9 @@ struct statlite { off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ + struct timespec st_atim; /* Time of last access. */ + struct timespec st_mtim; /* Time of last modification. */ + struct timespec st_ctim; /* Time of last status change. */ //time_t st_atime; /* time of last access */ //time_t st_mtime; /* time of last modification */ //time_t st_ctime; /* time of last change */ @@ -39,4 +45,19 @@ struct statlite { #define S_ISVALIDMTIME(m) (m & S_REQUIREMTIME) #define S_ISVALIDCTIME(m) (m & S_REQUIRECTIME) + +// readdirplus etc. + +struct dirent_plus { + struct dirent d_dirent; /* dirent struct for this entry */ + struct stat d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; +struct dirent_lite { + struct dirent d_dirent; /* dirent struct for this entry */ + struct statlite d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; + +} #endif diff --git a/ceph/common/Clock.h b/ceph/common/Clock.h index 1fed020eddfa4..5ab16a9686d69 100644 --- a/ceph/common/Clock.h +++ b/ceph/common/Clock.h @@ -169,7 +169,7 @@ class Clock { // absolute time time_t gettime() { - return now().sec(); + return real_now().sec(); } }; diff --git a/ceph/fakefuse.cc b/ceph/fakefuse.cc index b2968b87e99ba..72b8b34debd9d 100644 --- a/ceph/fakefuse.cc +++ b/ceph/fakefuse.cc @@ -8,6 +8,8 @@ using namespace std; #include "config.h" #include "mds/MDCluster.h" +#include "mon/Monitor.h" + #include "mds/MDS.h" #include "osd/OSD.h" #include "client/Client.h" @@ -63,21 +65,36 @@ int main(int argc, char **argv) { args = nargs; vec_to_argv(args, argc, argv); + Monitor *mon[g_conf.num_mon]; + for (int i=0; iinit(); } // create mds MDS *mds[NUMMDS]; for (int i=0; iinit(); } + // init + for (int i=0; iinit(); + } + for (int i=0; iinit(); + } + for (int i=0; iinit(); + } + + // create client Client *client[NUMCLIENT]; for (int i=0; i +#include #include +} #include #include @@ -189,14 +192,25 @@ typedef __uint64_t version_t; #define FILE_MODE_W 2 #define FILE_MODE_RW 3 +#define INODE_MASK_BASE 1 // ino, ctime, nlink +#define INODE_MASK_PERM 2 // uid, gid, mode +#define INODE_MASK_SIZE 4 // size, blksize, blocks +#define INODE_MASK_MTIME 8 // mtime +#define INODE_MASK_ATIME 16 // atime + +#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME) +//#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME) + struct inode_t { - // immutable + // base (immutable) inodeno_t ino; // NOTE: ino _must_ come first for MDStore.cc to behave!! time_t ctime; + // other FileLayout layout; // ?immutable? + int nlink; // base, - // hard (namespace permissions) + // hard/perm (namespace permissions) mode_t mode; uid_t uid; gid_t gid; @@ -205,13 +219,31 @@ struct inode_t { off_t size; time_t atime, mtime; // maybe atime different? "lazy"? - // other - int nlink; + int mask; // special stuff unsigned char hash_seed; // only defined for dir; 0 if not hashed. bool anchored; // auth only version_t file_data_version; // auth only + + bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } + bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } + bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } + + void fill_stat(struct stat *st) { + memset(st, 0, sizeof(struct stat)); + st->st_ino = ino; + st->st_mode = mode; + st->st_nlink = nlink; + st->st_uid = uid; + st->st_gid = gid; + st->st_ctime = ctime; + st->st_atime = atime; + st->st_mtime = mtime; + st->st_size = size; + st->st_blocks = size ? ((size - 1) / 4096 + 1):0; + st->st_blksize = 4096; + } }; @@ -267,7 +299,14 @@ inline ostream& operator<<(ostream& out, const eversion_t e) { #ifdef OBJECT128 -typedef lame128_t object_t; +//typedef lame128_t object_t; +struct object_t { + inodeno_t ino; // 64 bits + __uint32_t bno; // 32 bits + + snapv_t snap_last; // 16 bits + snapv_t snap_first; // 16 bits +}; #else typedef __uint64_t object_t; // object id #endif @@ -275,6 +314,9 @@ typedef __uint64_t object_t; // object id #define PG_NONE 0xffffffffffffffffLL +typedef __uint16_t snapv_t; // snapshot version + + class OSDSuperblock { public: const static __uint64_t MAGIC = 0xeb0f505dULL; diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index 5aed7e608bf54..e38b81bccfd80 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -227,7 +227,7 @@ class CDir { set hashed_subset; // HASHING: subset of mds's that are hashed public: // for class MDS - map > hashed_readdir; + map, list > > hashed_readdir; protected: // context diff --git a/ceph/mds/Capability.h b/ceph/mds/Capability.h index 0701a282137ba..49032048eec19 100644 --- a/ceph/mds/Capability.h +++ b/ceph/mds/Capability.h @@ -24,11 +24,12 @@ using namespace std; // definite caps -#define CAP_FILE_RDCACHE 1 -#define CAP_FILE_RD 2 -#define CAP_FILE_WR 4 -#define CAP_FILE_WRBUFFER 8 -//#define CAP_INODE_STAT 16 +#define CAP_FILE_RDCACHE 1 // client can safely cache reads +#define CAP_FILE_RD 2 // client can read +#define CAP_FILE_WR 4 // client can write +#define CAP_FILE_WREXTEND 8 // client can extend file +#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes + // heuristics //#define CAP_FILE_DELAYFLUSH 32 @@ -41,6 +42,7 @@ inline string cap_string(int cap) if (cap & CAP_FILE_RD) s += " rd"; if (cap & CAP_FILE_WR) s += " wr"; if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; + if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; s += " ]"; return s; } diff --git a/ceph/mds/Lock.h b/ceph/mds/Lock.h index 85b319770c249..c614f08a570fb 100644 --- a/ceph/mds/Lock.h +++ b/ceph/mds/Lock.h @@ -24,31 +24,49 @@ using namespace std; #include "Capability.h" // states and such. -// C = cache reads, R = read, W = write, B = buffer writes +// C = cache reads, R = read, W = write, A = append, B = buffer writes -// basic lock -----auth---- ---replica--- -#define LOCK_SYNC 0 // AR R . / C R . . R . / C R . . stat() -#define LOCK_LOCK 1 // AR R W / C . . . . . / C . . . truncate() -#define LOCK_GLOCKR 2 // AR R . / C . . . . . / C . . . +// basic lock -----auth------ ---replica----- +#define LOCK_SYNC 0 // AR R . / C R . . . R . / C R . . stat() +#define LOCK_LOCK 1 // AR R W / C . . . . . . / C . . . truncate() +#define LOCK_GLOCKR 2 // AR R . / C . . . . . . / C . . . // file lock states -#define LOCK_GLOCKL 3 // A . . / . . . . -#define LOCK_GLOCKM 4 // A . . / . . . . -#define LOCK_MIXED 5 // AR . . / . R W . . . / . R . . -#define LOCK_GMIXEDR 6 // AR R . / . R . . . . / . R . . -#define LOCK_GMIXEDL 7 // A . . / . . . . +#define LOCK_GLOCKL 3 // A . . / . . . . . +#define LOCK_GLOCKM 4 // A . . / . . . . . +#define LOCK_MIXED 5 // AR . . / . R W A . . . / . R . . +#define LOCK_GMIXEDR 6 // AR R . / . R . . . . . / . R . . +#define LOCK_GMIXEDL 7 // A . . / . . . . . -#define LOCK_LONER 8 // A . . / C R W B (lock) -#define LOCK_GLONERR 9 // A . . / . R . . -#define LOCK_GLONERM 10 // A . . / . R W . +#define LOCK_LONER 8 // A . . / C R W A B (lock) +#define LOCK_GLONERR 9 // A . . / . R . . . +#define LOCK_GLONERM 10 // A . . / . R W A . -#define LOCK_GSYNCL 11 // A . . / . . . . -#define LOCK_GSYNCM 12 // A . . / . R . . +#define LOCK_GSYNCL 11 // A . . / . . . . . +#define LOCK_GSYNCM 12 // A . . / . R . . . // 4 stable // +9 transition // 13 total +/* no append scenarios: + +loner + truncate(): + - loner needs to lose A (?unless it's the loner doing the truncate?) +loner + statlite(size): + - loner needs to lose A + +any + statlite(size) + - all lose A + +any + statlite(mtime) + - all lose W + + +-> we need to add lonerfixed and mixedfixed states (and associated transitions) + in order to efficiently support statlite(size) and truncate(). + + */ // -- lock... hard or file @@ -193,7 +211,7 @@ class CLock { // client caps allowed int caps_allowed_ever(bool auth) { if (auth) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WRBUFFER; + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER; else return CAP_FILE_RDCACHE | CAP_FILE_RD; } @@ -211,18 +229,18 @@ class CLock { return 0; case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR; + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND; case LOCK_GMIXEDR: return CAP_FILE_RD; case LOCK_GMIXEDL: return 0; case LOCK_LONER: // single client writer, of course. - return CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_RD | CAP_FILE_RDCACHE; + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER; case LOCK_GLONERR: - return CAP_FILE_WR; + return CAP_FILE_RD; case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR; + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND; case LOCK_GSYNCL: return 0; @@ -257,7 +275,7 @@ class CLock { case LOCK_GLONERM: case LOCK_GLONERR: case LOCK_LONER: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WRBUFFER; + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER; case LOCK_LOCK: case LOCK_GLOCKR: case LOCK_GLOCKL: diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index ba35287d47a90..f7341e596652e 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -771,7 +771,7 @@ bool MDCache::shutdown_pass() // imports? if (!imports.empty()) { dout(7) << "still have " << imports.size() << " imports" << endl; - //show_cache(); + show_cache(); return false; } @@ -781,7 +781,7 @@ bool MDCache::shutdown_pass() show_cache(); //dump(); return false; - } + } // done! dout(1) << "shutdown done, sending shutdown_finish" << endl; @@ -816,7 +816,8 @@ int MDCache::open_root(Context *c) // make it up (FIXME) root->inode.mode = 0755 | INODE_MODE_DIR; root->inode.size = 0; - root->inode.mtime = 0; + root->inode.ctime = 0; + root->inode.mtime = g_clock.gettime(); root->inode.nlink = 1; root->inode.layout = g_OSD_MDDirLayout; @@ -2755,36 +2756,36 @@ void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c) void MDCache::handle_dentry_unlink(MDentryUnlink *m) { CInode *diri = get_inode(m->get_dirino()); - CDir *dir; + CDir *dir = 0; if (diri) dir = diri->dir; + if (!diri || !dir) { dout(7) << "handle_dentry_unlink don't have dir " << hex << m->get_dirino() << dec << endl; - delete m; - return; } - - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << endl; - - // dir? - if (dn->inode) { - if (dn->inode->dir) { - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); + else { + CDentry *dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; + } else { + dout(7) << "handle_dentry_unlink on " << *dn << endl; + + // dir? + if (dn->inode) { + if (dn->inode->dir) { + dn->inode->dir->state_set(CDIR_STATE_DELETED); + dn->inode->dir->remove_null_dentries(); + } } + + string dname = dn->name; + + // unlink + dn->dir->remove_dentry(dn); + + // wake up + //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); + dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); } - - string dname = dn->name; - - // unlink - dn->dir->remove_dentry(dn); - - // wake up - //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); - dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); } delete m; diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 1b79ba9835bf9..d38a9eedfe035 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -1206,7 +1206,9 @@ bool MDS::try_open_dir(CInode *in, MClientRequest *req) // READDIR -int MDS::encode_dir_contents(CDir *dir, list& items) +int MDS::encode_dir_contents(CDir *dir, + list& inls, + list& dnls) { int numfiles = 0; @@ -1223,7 +1225,8 @@ int MDS::encode_dir_contents(CDir *dir, list& items) // is dentry readable? if (dn->is_xlocked()) { // ***** FIXME ***** - dout(10) << "warning, returning xlocked dentry, we are technically WRONG" << endl; + // ? + dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl; } CInode *in = dn->inode; @@ -1232,8 +1235,9 @@ int MDS::encode_dir_contents(CDir *dir, list& items) dout(12) << "including inode " << *in << endl; // add this item - // note: c_inode_info makes note of whether inode data is readable. - items.push_back( new c_inode_info(in, whoami, it->first) ); + // note: InodeStat makes note of whether inode data is readable. + dnls.push_back( it->first ); + inls.push_back( new InodeStat(in, whoami) ); numfiles++; } return numfiles; @@ -1264,14 +1268,15 @@ void MDS::handle_hash_readdir(MHashReaddir *m) dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; mdstore->fetch_dir(dir, new C_MDS_RetryMessage(this, m)); return; - } +} // get content - list items; - encode_dir_contents(dir, items); + list inls; + list dnls; + int num = encode_dir_contents(dir, inls, dnls); // sent it back! - messenger->send_message(new MHashReaddirReply(dir->ino(), items), + messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), m->get_source(), MDS_PORT_CACHE, MDS_PORT_CACHE); } @@ -1295,8 +1300,10 @@ void MDS::handle_hash_readdir_reply(MHashReaddirReply *m) // move items to hashed_readdir gather int from = MSG_ADDR_NUM(m->get_source()); assert(dir->hashed_readdir.count(from) == 0); - dir->hashed_readdir[from].splice(dir->hashed_readdir[from].begin(), - m->get_items()); + dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), + m->get_in()); + dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), + m->get_dn()); delete m; // gather finished? @@ -1313,11 +1320,11 @@ void MDS::handle_hash_readdir_reply(MHashReaddirReply *m) finish_contexts(finished); // now discard these results - for (map >::iterator it = dir->hashed_readdir.begin(); + for (map, list > >::iterator it = dir->hashed_readdir.begin(); it != dir->hashed_readdir.end(); it++) { - for (list::iterator ci = it->second.begin(); - ci != it->second.end(); + for (list::iterator ci = it->second.first.begin(); + ci != it->second.first.end(); ci++) delete *ci; } @@ -1358,7 +1365,8 @@ void MDS::finish_hash_readdir(MClientRequest *req, CDir *dir) reply->set_result(0); for (int i=0; iget_num_mds(); i++) { - reply->copy_dir_items(dir->hashed_readdir[i]); + reply->copy_dir_items(dir->hashed_readdir[i].first, + dir->hashed_readdir[i].second); } // ok! @@ -1441,7 +1449,9 @@ void MDS::handle_client_readdir(MClientRequest *req, dir->auth_pin(); // get local bits - encode_dir_contents(cur->dir, dir->hashed_readdir[whoami]); + encode_dir_contents(cur->dir, + dir->hashed_readdir[whoami].first, + dir->hashed_readdir[whoami].second); // request other bits for (int i=0; iget_num_mds(); i++) { @@ -1453,25 +1463,30 @@ void MDS::handle_client_readdir(MClientRequest *req, // wait dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, new C_MDS_HashReaddir(this, req, dir)); - return; + } else { + // NON-HASHED + // build dir contents + list inls; + list dnls; + int numfiles = encode_dir_contents(cur->dir, inls, dnls); + + // . too + dnls.push_back("."); + inls.push_back(new InodeStat(cur, whoami)); + ++numfiles; + + // yay, reply + MClientReply *reply = new MClientReply(req); + reply->take_dir_items(inls, dnls, numfiles); + + dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; + reply->set_result(0); + + //balancer->hit_dir(cur->dir); + + // reply + reply_request(req, reply, cur); } - - // NON-HASHED - // build dir contents - list items; - int numfiles = encode_dir_contents(cur->dir, items); - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(items); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(0); - - //balancer->hit_dir(cur->dir); - - // reply - reply_request(req, reply, cur); } @@ -2553,15 +2568,10 @@ void MDS::handle_client_mkdir(MClientRequest *req, CInode *diri) } } - // commit + // commit to log commit_request(req, new MClientReply(req, 0), diri, - new EInodeUpdate(newi)); - //, new EDirUpdate(newdir)); - - // schedule a commit for good measure - // NOTE: not strictly necessary.. it's in the log! - // but, if fakemds crashes we'll be less likely to corrupt osddata/* (in leiu of a real recovery mechanism) - //mdstore->commit_dir(newdir, NULL); + new EInodeUpdate(newi),//); + new EDirUpdate(newdir)); // FIXME: weird performance regression here w/ double log; somewhat of a mystery! return; } diff --git a/ceph/mds/MDS.h b/ceph/mds/MDS.h index c051236867641..01a71aa38c2ca 100644 --- a/ceph/mds/MDS.h +++ b/ceph/mds/MDS.h @@ -245,7 +245,9 @@ public: // readdir void handle_client_readdir(MClientRequest *req, CInode *ref); - int encode_dir_contents(CDir *dir, list& items); + int encode_dir_contents(CDir *dir, + list& inls, + list& dnls); void handle_hash_readdir(MHashReaddir *m); void handle_hash_readdir_reply(MHashReaddirReply *m); void finish_hash_readdir(MClientRequest *req, CDir *dir); diff --git a/ceph/messages/MClientReply.h b/ceph/messages/MClientReply.h index d12f2c41d8ddd..0e56280215178 100644 --- a/ceph/messages/MClientReply.h +++ b/ceph/messages/MClientReply.h @@ -47,36 +47,35 @@ class CInode; * */ -class c_inode_info { +class InodeStat { + public: inode_t inode; - string ref_dn; // referring dentry (blank if root) - string symlink; // symlink content (if symlink) + string symlink; // symlink content (if symlink) - bool inode_file_valid; // true if inode info is valid (ie was readable on mds at the time) - bool inode_hard_valid; // true if inode info is valid (ie was readable on mds at the time) + // mds distribution hints int dir_auth; bool hashed, replicated; - bool spec_defined; set dist; // where am i replicated? - public: - c_inode_info() {} - c_inode_info(CInode *in, int whoami, string ref_dn) { - // inode - this->inode = in->inode; - this->inode_file_valid = in->filelock.can_read(in->is_auth()); - this->inode_hard_valid = in->hardlock.can_read(in->is_auth()); + InodeStat() {} + InodeStat(CInode *in, int whoami) : + inode(in->inode) + { + // inode.mask + inode.mask = INODE_MASK_BASE; + if (in->filelock.can_read(in->is_auth())) + inode.mask |= INODE_MASK_PERM; + if (in->hardlock.can_read(in->is_auth())) + inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out. // symlink content? - if (in->is_symlink()) this->symlink = in->symlink; + if (in->is_symlink()) + symlink = in->symlink; - // referring dentry? - this->ref_dn = ref_dn; - // replicated where? if (in->dir && in->dir->is_auth()) { spec_defined = true; @@ -96,14 +95,11 @@ class c_inode_info { void _encode(bufferlist &bl) { bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&inode_file_valid, sizeof(inode_file_valid)); - bl.append((char*)&inode_hard_valid, sizeof(inode_hard_valid)); bl.append((char*)&spec_defined, sizeof(spec_defined)); bl.append((char*)&dir_auth, sizeof(dir_auth)); bl.append((char*)&hashed, sizeof(hashed)); bl.append((char*)&replicated, sizeof(replicated)); - ::_encode(ref_dn, bl); ::_encode(symlink, bl); ::_encode(dist, bl); // distn } @@ -111,10 +107,6 @@ class c_inode_info { void _decode(bufferlist &bl, int& off) { bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - bl.copy(off, sizeof(inode_file_valid), (char*)&inode_file_valid); - off += sizeof(inode_file_valid); - bl.copy(off, sizeof(inode_hard_valid), (char*)&inode_hard_valid); - off += sizeof(inode_hard_valid); bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); off += sizeof(spec_defined); bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); @@ -124,7 +116,6 @@ class c_inode_info { bl.copy(off, sizeof(replicated), (char*)&replicated); off += sizeof(replicated); - ::_decode(ref_dn, bl, off); ::_decode(symlink, bl, off); ::_decode(dist, bl, off); } @@ -136,11 +127,12 @@ typedef struct { long tid; int op; int result; // error code - int trace_depth; - int dir_size; unsigned char file_caps; // for open long file_caps_seq; __uint64_t file_data_version; // for client buffercache consistency + + int _num_trace_in; + int _dir_size; } MClientReply_st; class MClientReply : public Message { @@ -148,8 +140,11 @@ class MClientReply : public Message { MClientReply_st st; string path; - vector trace; - vector dir_contents; + list trace_in; + list trace_dn; + + list dir_in; + list dir_dn; public: void set_pcid(long pcid) { this->st.pcid = pcid; } @@ -157,11 +152,19 @@ class MClientReply : public Message { long get_tid() { return st.tid; } int get_op() { return st.op; } - inodeno_t get_ino() { return trace[trace.size()-1]->inode.ino; } + int get_result() { return st.result; } const string& get_path() { return path; } - const vector& get_trace() { return trace; } - vector& get_dir_contents() { return dir_contents; } + + inodeno_t get_ino() { return trace_in.back()->inode.ino; } + const inode_t& get_inode() { return trace_in.back()->inode; } + + const list& get_trace_in() { return trace_in; } + const list& get_trace_dn() { return trace_dn; } + + const list& get_dir_in() { return dir_in; } + const list& get_dir_dn() { return dir_dn; } + unsigned char get_file_caps() { return st.file_caps; } long get_file_caps_seq() { return st.file_caps_seq; } __uint64_t get_file_data_version() { return st.file_data_version; } @@ -181,16 +184,16 @@ class MClientReply : public Message { this->path = req->get_path(); this->st.result = result; - st.trace_depth = 0; - st.dir_size = 0; + + st._dir_size = 0; + st._num_trace_in = 0; } virtual ~MClientReply() { - vector::iterator it; + list::iterator it; - for (it = trace.begin(); it != trace.end(); it++) + for (it = trace_in.begin(); it != trace_in.end(); ++it) delete *it; - - for (it = dir_contents.begin(); it != dir_contents.end(); it++) + for (it = dir_in.begin(); it != dir_in.end(); ++it) delete *it; } virtual char *get_type_name() { return "creply"; } @@ -204,66 +207,91 @@ class MClientReply : public Message { _decode(path, payload, off); - for (int i=0; i_decode(payload, off); - trace.push_back(ci); + trace_in.push_back(ci); } - if (st.dir_size) { - for (int i=0; i_decode(payload, off); - dir_contents.push_back(ci); - } + for (int i=0; i_decode(payload, off); + dir_in.push_back(ci); + string dn; + ::_decode(dn, payload, off); + dir_dn.push_back(dn); } } virtual void encode_payload() { - st.dir_size = dir_contents.size(); - st.trace_depth = trace.size(); - payload.append((char*)&st, sizeof(st)); _encode(path, payload); - vector::iterator it; - for (it = trace.begin(); it != trace.end(); it++) - (*it)->_encode(payload); + // trace + list::iterator pdn = trace_dn.begin(); + list::iterator pin; + for (pin = trace_in.begin(); + pin != trace_in.end(); + ++pin) { + if (pin != trace_in.begin()) { + ::_encode(*pdn, payload); + ++pdn; + } + (*pin)->_encode(payload); + } - for (it = dir_contents.begin(); it != dir_contents.end(); it++) - (*it)->_encode(payload); + // dir contents + pdn = dir_dn.begin(); + for (pin = dir_in.begin(); + pin != dir_in.end(); + ++pin, ++pdn) { + (*pin)->_encode(payload); + ::_encode(*pdn, payload); + } } // builders - void add_dir_item(c_inode_info *c) { - dir_contents.push_back(c); + /* + void add_dir_item(string& dn, InodeStat *in) { + dir_dn.push_back(dn); + dir_in.push_back(in); + ++st._dir_size; + }*/ + void take_dir_items(list& inls, + list& dnls, + int num) { + dir_in.swap(inls); + dir_dn.swap(dnls); + st._dir_size = num; } - void take_dir_items(list& l) { - for (list::iterator it = l.begin(); - it != l.end(); - it++) { - dir_contents.push_back(*it); - } - l.clear(); - } - void copy_dir_items(list& l) { - for (list::iterator it = l.begin(); - it != l.end(); - it++) { + void copy_dir_items(const list& inls, + const list& dnls) { + list::const_iterator pdn = dnls.begin(); + list::const_iterator pin = inls.begin(); + while (pin != inls.end()) { // copy! - c_inode_info *i = new c_inode_info; - *i = **it; - dir_contents.push_back(i); + InodeStat *i = new InodeStat; + *i = **pin; + dir_in.push_back(i); + dir_dn.push_back(*pdn); + ++pin; + ++pdn; + ++st._dir_size; } } void set_trace_dist(CInode *in, int whoami) { + st._num_trace_in = 0; while (in) { // add this inode to trace, along with referring dentry name - string ref_dn; - CDentry *dn = in->get_parent_dn(); - if (dn) ref_dn = dn->get_name(); - - trace.insert(trace.begin(), new c_inode_info(in, whoami, ref_dn)); + if (in->get_parent_dn()) + trace_dn.push_front(in->get_parent_dn()->get_name()); + trace_in.push_front(new InodeStat(in, whoami)); + ++st._num_trace_in; in = in->get_parent_inode(); } diff --git a/ceph/messages/MClientRequest.h b/ceph/messages/MClientRequest.h index 66b9e2fcaf638..e8175fd0c4014 100644 --- a/ceph/messages/MClientRequest.h +++ b/ceph/messages/MClientRequest.h @@ -90,6 +90,7 @@ class MClientRequest : public Message { void set_tid(long t) { st.tid = t; } void set_path(string& p) { path.set_path(p); } void set_path(const char *p) { path.set_path(p); } + void set_path(const filepath& fp) { path = fp; } void set_caller_uid(int u) { st.caller_uid = u; } void set_caller_gid(int g) { st.caller_gid = g; } void set_ino(inodeno_t ino) { st.ino = ino; } diff --git a/ceph/messages/MHashReaddirReply.h b/ceph/messages/MHashReaddirReply.h index a4ba1582d69ac..0d4cd7a18f02c 100644 --- a/ceph/messages/MHashReaddirReply.h +++ b/ceph/messages/MHashReaddirReply.h @@ -19,23 +19,29 @@ class MHashReaddirReply : public Message { inodeno_t ino; - list dir_contents; + + list dir_in; + list dir_dn; + int num; + public: MHashReaddirReply() { } - MHashReaddirReply(inodeno_t ino, list& ls) : - Message(MSG_MDS_HASHREADDIRREPLY) { - this->ino = ino; - dir_contents.splice(dir_contents.begin(), ls); + MHashReaddirReply(inodeno_t _ino, list& inls, list& dnls, int n) : + Message(MSG_MDS_HASHREADDIRREPLY), + ino(_ino), + num(n) { + dir_in.swap(inls); + dir_dn.swap(dnls); } ~MHashReaddirReply() { - list::iterator it; - for (it = dir_contents.begin(); it != dir_contents.end(); it++) + for (list::iterator it = dir_in.begin(); it != dir_in.end(); it++) delete *it; } inodeno_t get_ino() { return ino; } - list& get_items() { return dir_contents; } + list& get_in() { return dir_in; } + list& get_dn() { return dir_dn; } virtual char *get_type_name() { return "Hls"; } @@ -47,18 +53,26 @@ class MHashReaddirReply : public Message { payload.copy(n, sizeof(n), (char*)&n); off += sizeof(n); for (int i=0; i_decode(payload, off); - dir_contents.push_back(ci); + dir_in.push_back(ci); } } virtual void encode_payload() { payload.append((char*)&ino, sizeof(ino)); - int n = dir_contents.size(); + int n = dir_in.size(); // FIXME? payload.append((char*)&n, sizeof(n)); - list::iterator it; - for (it = dir_contents.begin(); it != dir_contents.end(); it++) - (*it)->_encode(payload); + list::iterator pdn = dir_dn.begin(); + for (list::iterator pin = dir_in.begin(); + pin != dir_in.end(); + ++pin, ++pdn) { + ::_encode(*pdn, payload); + (*pin)->_encode(payload); + } } }; diff --git a/ceph/osdc/Filer.cc b/ceph/osdc/Filer.cc index cf8170a6350f2..b19600d7ea114 100644 --- a/ceph/osdc/Filer.cc +++ b/ceph/osdc/Filer.cc @@ -31,13 +31,17 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << messenger->get_myaddr() << ".filer " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << "filer " void Filer::file_to_extents(inode_t inode, off_t offset, size_t len, list& extents) { + dout(10) << "file_to_extents " << offset << "~" << len + << " on " << hex << inode.ino << dec + << endl; + /* we want only one extent per object! * this means that each extent we read may map into different bits of the * final read buffer.. hence OSDExtent.buffer_extents @@ -46,7 +50,8 @@ void Filer::file_to_extents(inode_t inode, assert(inode.layout.object_size >= inode.layout.stripe_size); off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size; - + dout(20) << " stripes_per_object " << stripes_per_object << endl; + off_t cur = offset; off_t left = len; while (left > 0) { @@ -93,6 +98,7 @@ void Filer::file_to_extents(inode_t inode, } ex->buffer_extents[cur-offset] = x_len; + dout(15) << "file_to_extents " << ex << endl; //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl; left -= x_len; -- 2.39.5