From cd0d8ec53cedfd71d09ba8a25b3c0637eee11064 Mon Sep 17 00:00:00 2001 From: sage Date: Wed, 6 Oct 2004 21:49:20 +0000 Subject: [PATCH] "progress" git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@110 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 7 + ceph/include/filepath.h | 41 +- ceph/mds/CDir.cc | 20 +- ceph/mds/CDir.h | 64 ++- ceph/mds/CInode.cc | 34 +- ceph/mds/CInode.h | 62 ++- ceph/mds/MDBalancer.cc | 6 +- ceph/mds/MDCache.cc | 867 ++++++++++++++++----------------- ceph/mds/MDCache.h | 25 +- ceph/mds/MDS.cc | 43 +- ceph/mds/MDStore.cc | 128 ++--- ceph/mds/MDStore.h | 12 +- ceph/mds/events/EInodeUnlink.h | 22 +- ceph/mds/events/EInodeUpdate.h | 6 +- ceph/messages/MClientRequest.h | 11 +- ceph/messages/MDiscover.h | 2 +- ceph/messages/MDiscoverReply.h | 50 +- ceph/msg/Message.h | 1 + ceph/msg/Messenger.cc | 11 +- 19 files changed, 781 insertions(+), 631 deletions(-) diff --git a/ceph/TODO b/ceph/TODO index d247aa5d0ce19..99b9954df00e7 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -7,6 +7,13 @@ BUGS? ISSUES + +- discover + - soft + - make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply + - error flag + + - log - overall logging strategy. - currently (?): do everything immediately, reply when logged. diff --git a/ceph/include/filepath.h b/ceph/include/filepath.h index 55e49d0ae6a1d..f6d71be719f7a 100644 --- a/ceph/include/filepath.h +++ b/ceph/include/filepath.h @@ -2,6 +2,7 @@ #define __FILEPATH_H #include +#include using namespace std; class filepath { @@ -29,14 +30,43 @@ class filepath { public: filepath() {} filepath(string& s) { + set_path(s); + } + filepath(char* s) { + set_path(s); + } + + void set_path(string& s) { path = s; parse(); } - filepath(char* s) { + void set_path(const char *s) { path = s; parse(); } + string& get_path() { + return path; + } + const char *c_str() { + return path.c_str(); + } + + + filepath subpath(int s) { + filepath t; + for (int i=s; i::iterator it = bits.begin(); it != bits.end(); it++) { - r.append(*it); + r.append((*it).c_str()); r.append((char)0); } return r; } - int _unrope(crope& r, int off = 0) { + int _unrope(crope r, int off = 0) { clear(); char n; r.copy(off, sizeof(char), (char*)&n); @@ -82,7 +109,7 @@ class filepath { for (int i=0; iname.length(); if (nitems == 1) - inode->get(CINODE_PIN_CHILD); // pin parent + get(CDIR_PIN_CHILD); // pin parent } void CDir::remove_child(CDentry *d) { @@ -118,7 +118,7 @@ void CDir::remove_child(CDentry *d) { //namesize -= d->name.length(); if (nitems == 0) - inode->put(CINODE_PIN_CHILD); // release parent. + put(CDIR_PIN_CHILD); // release parent. } @@ -211,7 +211,7 @@ void CDir::add_waiter(int tag, const string& dentry, Context *c) { if (waiting_on_dentry.size() == 0) - inode->get(CINODE_PIN_DIRWAITDN); + get(CDIR_PIN_WAITER); waiting_on_dentry[ dentry ].insert(pair(tag,c)); dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; } @@ -231,7 +231,7 @@ void CDir::add_waiter(int tag, Context *c) { // this dir. if (waiting.empty()) - inode->get(CINODE_PIN_DIRWAIT); + get(CDIR_PIN_WAITER); waiting.insert(pair(tag,c)); dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; } @@ -261,7 +261,7 @@ void CDir::take_waiting(int mask, // ...whole map? if (waiting_on_dentry.size() == 0) - inode->put(CINODE_PIN_DIRWAITDN); + put(CDIR_PIN_WAITER); } /* NOTE: this checks dentry waiters too */ @@ -292,7 +292,7 @@ void CDir::take_waiting(int mask, } if (waiting.empty()) - inode->put(CINODE_PIN_DIRWAIT); + put(CDIR_PIN_WAITER); } } @@ -342,7 +342,7 @@ void CDir::mark_clean() // ref counts -void CDir::get(int by) { +void CDir::put(int by) { // bad? if (ref == 0 || ref_set.count(by) != 1) { dout(7) << *this << " bad put by " << by << " was " << ref << " (" << ref_set << ")" << endl; @@ -374,7 +374,7 @@ void CDir::get(int by) { ref++; ref_set.insert(by); - dout(7) << *this " get by " << by << " now " << ref << " (" << ref_set << ")" << endl; + dout(7) << *this << " get by " << by << " now " << ref << " (" << ref_set << ")" << endl; } @@ -439,7 +439,7 @@ void CDir::auth_pin() { void CDir::auth_unpin() { auth_pins--; if (auth_pins == 0) - put(CINODE_PIN_DAUTHPIN); + put(CDIR_PIN_AUTHPIN); assert(auth_pins >= 0); dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; @@ -714,5 +714,5 @@ void CDir::dump_to_disk(MDS *mds) } dout(10) << "dump2disk: writing dir " << inode->inode.ino << endl; - mds->mdstore->commit_dir(inode, NULL); + mds->mdstore->commit_dir(inode->dir, NULL); } diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index a73b371f01645..150b63ad0acda 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -93,7 +93,7 @@ class Context; #define CDIR_PIN_AUTHPIN 8 #define CDIR_NUM_PINS 9 -static char cdir_pin_names[CDIR_NUM_PINS] = { +static char* cdir_pin_names[CDIR_NUM_PINS] = { "child", "opened", "hashed", @@ -210,7 +210,7 @@ class CDir { friend class CDirExport; public: - CDir(CInode *in, MDS *mds); + CDir(CInode *in, MDS *mds, bool auth=false); @@ -269,7 +269,20 @@ class CDir { int get_replica_nonce() { assert(!is_auth()); return replica_nonce; } - + int open_by_add(int mds) { + if (is_open_by(mds)) { // already had it? + // new nonce (+1) + map::iterator it = open_by_nonce.find(mds); + open_by_nonce.insert(pair(mds,it->second + 1)); + return it->second + 1; + } + if (open_by.empty()) + get(CDIR_PIN_OPENED); + open_by.insert(mds); + open_by_nonce.insert(pair(mds,1)); // first! serial of 1. + return 1; // default nonce + } + @@ -419,14 +432,30 @@ class CDirDiscover { inodeno_t ino; int nonce; int dir_auth; + int dir_rep; set rep_by; + public: + CDirDiscover(); CDirDiscover(CDir *dir, int nonce) { ino = dir->ino(); this->nonce = nonce; dir_auth = dir->dir_auth; + dir_rep = dir->dir_rep; rep_by = dir->dir_rep_by; } + + void update_dir(CDir *dir) { + assert(dir->ino() == ino); + assert(!dir->is_auth()); + + dir->replica_nonce = nonce; + dir->dir_auth = dir_auth; + dir->dir_rep = dir_rep; + dir->dir_rep_by = rep_by; + } + + crope _rope() { crope r; @@ -434,6 +463,7 @@ class CDirDiscover { r.append((char*)&ino, sizeof(ino)); r.append((char*)&nonce, sizeof(nonce)); r.append((char*)&dir_auth, sizeof(dir_auth)); + r.append((char*)&dir_rep, sizeof(dir_rep)); int nrep_by = rep_by.size(); r.append((char*)&nrep_by, sizeof(nrep_by)); @@ -456,6 +486,8 @@ class CDirDiscover { off += sizeof(nonce); s.copy(off, sizeof(dir_auth), (char*)&dir_auth); off += sizeof(dir_auth); + s.copy(off, sizeof(dir_rep), (char*)&dir_rep); + off += sizeof(dir_rep); int nrep_by; s.copy(off, sizeof(int), (char*)&nrep_by); @@ -491,24 +523,45 @@ typedef struct { } CDirExport_st; class CDirExport { - CDirExport_st st; + set open_by; map open_by_nonce; set rep_by; + public: CDirExport(CDir *dir) { st.ino = dir->ino(); st.nitems = dir->nitems; st.version = dir->version; st.state = dir->state; - st.popularity = dir->popularity; + st.popularity = dir->popularity[0]; // FIXME FIXME st.dir_auth = dir->dir_auth; st.dir_rep = dir->dir_rep; rep_by = dir->dir_rep_by; + open_by = dir->open_by; open_by_nonce = dir->open_by_nonce; } + inodeno_t get_ino() { return st.ino; } + + void update_dir(CDir *dir) { + assert(dir->ino() == st.ino); + + dir->nitems = st.nitems; + dir->version = st.version; + dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. + (st.state & CDIR_MASK_STATE_EXPORTED); + dir->popularity[0] = st.popularity; + dir->dir_auth = st.dir_auth; + dir->dir_rep = st.dir_rep; + + dir->dir_rep_by = rep_by; + dir->open_by = open_by; + dir->open_by_nonce = open_by_nonce; + } + + crope _rope() { crope r; @@ -548,6 +601,7 @@ class CDirExport { off += sizeof(int); s.copy(off, sizeof(int), (char*)&n); off += sizeof(int); + open_by.insert(m); open_by_nonce.insert(pair(m,n)); } diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc index 6b6a8042e801d..f5322841fff89 100644 --- a/ceph/mds/CInode.cc +++ b/ceph/mds/CInode.cc @@ -18,15 +18,14 @@ // ====== CInode ======= -CInode::CInode() : LRUObject() { +CInode::CInode(bool auth) : LRUObject() { ref = 0; parent = NULL; nparents = 0; lru_next = lru_prev = NULL; - dir_auth = CDIR_AUTH_PARENT; - dir = NULL; // create CDir as needed + dir = NULL; // CDir opened separately auth_pins = 0; nested_auth_pins = 0; @@ -39,7 +38,7 @@ CInode::CInode() : LRUObject() { version = 0; - auth = true; // by default. + this->auth = auth; // by default. } CInode::~CInode() { @@ -59,6 +58,13 @@ CInode *CInode::get_parent_inode() return NULL; } +bool CInode::dir_is_auth() { + if (dir) + return dir->is_auth(); + else + return is_auth(); +} + CDir *CInode::get_or_open_dir(MDS *mds) { assert(is_dir()); @@ -67,7 +73,15 @@ CDir *CInode::get_or_open_dir(MDS *mds) // only auth can open dir alone. assert(is_auth()); - dir = new CDir(this, mds); + set_dir( new CDir(this, mds) ); + return dir; +} + +CDir *CInode::set_dir(CDir *newdir) +{ + assert(dir == 0); + get(CINODE_PIN_DIR); + dir = newdir; return dir; } @@ -119,7 +133,7 @@ ostream& operator<<(ostream& out, CInode& in) for(set::iterator it = in.get_ref_set().begin(); it != in.get_ref_set().end(); it++) - if (*it < CINODE_PIN_NUM) + if (*it < CINODE_NUM_PINS) out << " " << cinode_pin_names[*it]; else out << " " << *it; @@ -209,10 +223,10 @@ crope CInode::encode_export_state() istate.dirty = true; else istate.dirty = false; - if (is_dir()) - istate.dir_auth = dir_auth; - else - istate.dir_auth = -1; + //if (is_dir()) + //istate.dir_auth = dir_auth; + //else + // istate.dir_auth = -1; // append to rope r.append( (char*)&istate, sizeof(istate) ); diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 025d82a524ab7..ecd3038be612e 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -109,6 +109,10 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { // waiters: inode_unlink // triggers: inode_unlink_finish +#define CINODE_WAIT_DIR 8192 + // waiters: traverse_path + // triggers: handle_disocver_reply + #define CINODE_WAIT_ANY 0xffff @@ -206,9 +210,11 @@ class CInode : LRUObject { friend class MDCache; friend class CDir; + friend class CInodeExport; + friend class CInodeDiscover; public: - CInode(); + CInode(bool auth=true); ~CInode(); @@ -228,18 +234,14 @@ class CInode : LRUObject { CInode *get_parent_inode(); CInode *get_realm_root(); // import, hash, or root - CDir *get_or_open_dir(); + CDir *get_or_open_dir(MDS *mds); + CDir *set_dir(CDir *newdir); bool dir_is_hashed() { if (inode.isdir == INODE_DIR_HASHED) return true; return false; } - bool dir_is_auth() { - if (dir) - return dir->is_auth(); - else - return is_auth(); - } + bool dir_is_auth(); float get_popularity() { return popularity[0].get(); @@ -491,6 +493,7 @@ class CInodeDiscover { bool is_softasync; bool is_lockbyauth; + public: CInodeDiscover() {} CInodeDiscover(CInode *in, int nonce) { inode = in->inode; @@ -499,6 +502,20 @@ class CInodeDiscover { is_softasync = in->is_softasync(); is_lockbyauth = in->is_lockbyauth() || in->is_prelock(); } + + inodeno_t get_ino() { return inode.ino; } + + int update_inode(CInode *in) { + in->inode = inode; + + in->set_auth(false); + assert(!in->is_auth()); + in->replica_nonce = replica_nonce; + + if (is_syncbyauth) in->dist_state |= CINODE_DIST_SYNCBYAUTH; + if (is_softasync) in->dist_state |= CINODE_DIST_SOFTASYNC; + if (is_lockbyauth) in->dist_state |= CINODE_DIST_LOCKBYAUTH; + } crope _rope() { crope r; @@ -533,27 +550,46 @@ typedef struct { inode_t inode; __uint64_t version; DecayCounter popularity; - bool dirty; // dirty inode? + bool is_dirty; // dirty inode? bool is_softasync; int ncached_by; // int pairs follow } CInodeExport_st; + class CInodeExport { CInodeExport_st st; set cached_by; map cached_by_nonce; +public: CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; st.version = in->get_version(); st.popularity = in->get_popularity(); - st.dirty = in->is_dirty(); + st.is_dirty = in->is_dirty(); st.is_softasync = in->is_softasync(); - cached_by = in->get_cached_by(); - cached_by_nonce = in->get_cached_by_nonce(); + cached_by = in->cached_by; + cached_by_nonce = in->cached_by_nonce; + } + + int update_inode(CInode *in) { + in->inode = st.inode; + + in->version = st.version; + in->popularity[0] = st.popularity; + + if (st.is_dirty) + in->mark_dirty(); + + if (st.is_softasync) + in->dist_state |= CINODE_DIST_SOFTASYNC; + + in->cached_by.clear(); + in->cached_by = cached_by; + in->cached_by_nonce = cached_by_nonce; } crope _rope() { @@ -577,7 +613,7 @@ class CInodeExport { int _unrope(crope s, int off = 0) { s.copy(off, sizeof(st), (char*)&st); off += sizeof(st); - + for (int i=0; imdcache->imports.end(); it++) { import_pop_map.insert(pair((*it)->get_popularity(), *it)); - int from = (*it)->inode->authority(mds->get_cluster()); + int from = (*it)->inode->authority(); dout(5) << "map i imported " << **it << " from " << from << endl; import_from_map.insert(pair(from, *it)); } @@ -238,7 +238,7 @@ void MDBalancer::do_rebalance() if (dir->inode->is_root()) continue; if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress double pop = dir->get_popularity(); - assert(dir->inode->authority(mds->get_cluster()) == target); // cuz that's how i put it in the map, dummy + assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy if (pop <= amount) { dout(5) << "reexporting " << *dir << " pop " << pop << " back to " << target << endl; @@ -266,7 +266,7 @@ void MDBalancer::do_rebalance() pop < MIN_REEXPORT) { dout(5) << "reexporting " << *imp << " pop " << pop << endl; amount -= pop; - mds->mdcache->export_dir(imp, imp->inode->authority(mds->get_cluster())); + mds->mdcache->export_dir(imp, imp->inode->authority()); } if (amount < MIN_OFFLOAD) break; } diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 251aed64e9542..09369b27ceddc 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -17,6 +17,7 @@ #include "events/EInodeUnlink.h" #include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" #include "messages/MInodeGetReplica.h" #include "messages/MInodeGetReplicaAck.h" @@ -36,6 +37,7 @@ #include "messages/MDirUpdate.h" #include "messages/MInodeExpire.h" +#include "messages/MDirExpire.h" #include "messages/MInodeUnlink.h" #include "messages/MInodeUnlinkAck.h" @@ -73,7 +75,6 @@ MDCache::MDCache(MDS *m) { mds = m; root = NULL; - opening_root = false; lru = new LRU(); lru->lru_set_max(g_conf.mdcache_size); lru->lru_set_midpoint(g_conf.mdcache_mid); @@ -150,7 +151,7 @@ void MDCache::unlink_inode(CInode *in) dn->dir->adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); // explicitly define auth - in->dangling_auth = in->authority(mds->get_cluster()); + in->dangling_auth = in->authority(); dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; // detach @@ -180,8 +181,19 @@ bool MDCache::trim(__int32_t max) { CInode *in = (CInode*)lru->lru_expire(); if (!in) return false; - // notify authority? - int auth = in->authority(mds->get_cluster()); + if (in->dir) { + // notify dir authority? + int auth = in->dir->authority(); + if (auth != mds->get_nodeid()) { + dout(7) << "sending dir_expire to mds" << auth << " on " << *in->dir << endl; + mds->messenger->send_message(new MDirExpire(in->ino(), mds->get_nodeid(), in->dir->replica_nonce), + MSG_ADDR_MDS(auth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + } + + // notify inode authority? + int auth = in->authority(); if (auth != mds->get_nodeid()) { dout(7) << "sending inode_expire to mds" << auth << " on " << *in << endl; mds->messenger->send_message(new MInodeExpire(in->ino(), mds->get_nodeid(), in->replica_nonce), @@ -213,7 +225,7 @@ bool MDCache::trim(__int32_t max) { !idir->is_root() && // not root !(idir->dir->is_freezing() || idir->dir->is_frozen()) // FIXME: can_auth_pin? ) { - int dest = idir->authority(mds->get_cluster()); + int dest = idir->authority(); // comment this out ot wreak havoc? if (mds->is_shutting_down()) dest = 0; // this is more efficient. @@ -321,7 +333,7 @@ bool MDCache::shutdown_pass() // un-import imports.erase(root->dir); root->dir->state_clear(CDIR_STATE_IMPORT); - root->put(CINODE_PIN_IMPORT); + root->dir->put(CDIR_PIN_IMPORT); assert(root->ref == 0); @@ -363,23 +375,19 @@ bool MDCache::shutdown_pass() -int MDCache::link_inode( CInode *parent, string& dname, CInode *in ) +int MDCache::link_inode( CDir *dir, string& dname, CInode *in ) { - if (!parent->dir) { - return -ENOTDIR; // not a dir - } - - assert(parent->dir->lookup(dname) == 0); + assert(dir->lookup(dname) == 0); // create dentry CDentry* dn = new CDentry(dname, in); in->add_parent(dn); // add to dir - parent->dir->add_child(dn); + dir->add_child(dn); // set dir version - in->parent_dir_version = parent->dir->get_version(); + in->parent_dir_version = dir->get_version(); return 0; } @@ -393,7 +401,7 @@ int MDCache::open_root(Context *c) // open root inode if (whoami == 0) { - // i am root + // i am root inode CInode *root = new CInode(); root->inode.ino = 1; root->inode.isdir = true; @@ -403,19 +411,19 @@ int MDCache::open_root(Context *c) root->inode.size = 0; root->inode.touched = 0; - root->dir = new CDir(root, whoami); - assert(root->dir->is_auth()); - root->dir_auth = 0; // me! - root->dir->dir_rep = CDIR_REP_NONE; - root->state_set(CINODE_STATE_ROOT); set_root( root ); - // root is technically an import (from a vacuum) + // root directory too + root->set_dir( new CDir(root, mds, true) ); + root->dir->dir_auth = 0; // me! + root->dir->dir_rep = CDIR_REP_NONE; + + // root is sort of technically an import (from a vacuum) imports.insert( root->dir ); root->dir->state_set(CDIR_STATE_IMPORT); - root->get(CINODE_PIN_IMPORT); + root->dir->get(CDIR_PIN_IMPORT); if (c) { c->finish(0); @@ -423,22 +431,23 @@ int MDCache::open_root(Context *c) } } else { // request inode from root mds - if (c) - waiting_for_root.push_back(c); - - if (!opening_root) { + if (waiting_for_root.empty()) { dout(7) << "discovering root" << endl; - opening_root = true; + filepath want; MDiscover *req = new MDiscover(whoami, - string(""), - NULL); + 0, + want); mds->messenger->send_message(req, MSG_ADDR_MDS(0), MDS_PORT_CACHE, MDS_PORT_CACHE); } else { dout(7) << "waiting for root" << endl; } + + // wait + waiting_for_root.push_back(c); + } } @@ -487,6 +496,9 @@ int MDCache::proc_message(Message *m) case MSG_MDS_DISCOVER: handle_discover((MDiscover*)m); break; + case MSG_MDS_DISCOVERREPLY: + handle_discover_reply((MDiscoverReply*)m); + break; case MSG_MDS_INODEUPDATE: @@ -556,8 +568,6 @@ int MDCache::proc_message(Message *m) handle_export_dir_notify_ack((MExportDirNotifyAck*)m); break; - - // export 3rd party (inode authority) case MSG_MDS_EXPORTDIRNOTIFY: handle_export_dir_notify((MExportDirNotify*)m); @@ -582,13 +592,14 @@ int MDCache::proc_message(Message *m) * 0 : success * >0 : delayed or forwarded */ -int MDCache::path_traverse(string& pathname, +int MDCache::path_traverse(filepath& path, vector& trace, Message *req, int onfail) { int whoami = mds->get_nodeid(); + // root CInode *cur = get_root(); if (cur == NULL) { dout(7) << "mds" << whoami << " i don't have root" << endl; @@ -597,124 +608,136 @@ int MDCache::path_traverse(string& pathname, return 1; } - // break path into bits. + // start trace trace.clear(); trace.push_back(cur); - // get read access - //if (read_wait(cur, req)) - // return 1; // wait - string have_clean; - filepath path = pathname; - for (int depth = 0; depth < path.depth(); depth++) { - string dname = path[depth]; - //dout(7) << " path seg " << dname << endl; + //dout(7) << " path seg " << path[depth] << endl; + + if (!cur->is_dir()) { + dout(7) << *cur << " not a dir " << cur->inode.isdir << endl; + return -ENOTDIR; + } - // lookup dentry - if (cur->is_dir()) { - if (!cur->dir) - cur->dir = new CDir( cur, whoami ); - - // frozen? - if (cur->dir->is_frozen_tree_root() || - cur->dir->is_frozen_dir()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << *cur->dir << " is frozen, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, + // open dir + if (!cur->dir) { + if (cur->dir_is_auth()) { + cur->get_or_open_dir(mds); + } else { + // discover dir from/via inode auth + assert(!cur->is_auth()); + filepath want = path.subpath(depth); + mds->messenger->send_message(new MDiscover(mds->get_nodeid(), + cur->ino(), + want, + true), + MSG_ADDR_MDS(cur->authority()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + cur->dir->add_waiter(CINODE_WAIT_DIR, + path[depth], new C_MDS_RetryMessage(mds, req)); - return 1; } - - // must read hard data to traverse - if (!read_hard_try(cur, req)) - return 1; - - // check permissions? - - - // dentry: - CDentry *dn = cur->dir->lookup(dname); - if (dn && dn->inode) { - // have it, keep going. - cur = dn->inode; - have_clean += "/"; - have_clean += dname; + } + + // frozen? + if (cur->dir->is_frozen()) { + // doh! + // FIXME: traverse is allowed? + dout(7) << *cur->dir << " is frozen, waiting" << endl; + cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, req)); + return 1; + } + + // must read hard data to traverse + if (!read_hard_try(cur, req)) + return 1; + + // check permissions? + + + // dentry + CDentry *dn = cur->dir->lookup(path[depth]); + if (dn && dn->inode) { + // have it, keep going. + cur = dn->inode; + have_clean += "/"; + have_clean += path[depth]; + + trace.push_back(cur); + continue; + } + + // don't have it. + int dauth = cur->dir->dentry_authority( path[depth] ); + + if (dauth == whoami) { + // mine. + if (cur->dir->is_complete()) { + // file not found + return -ENOENT; } else { - // don't have it. - int dauth = cur->dir->dentry_authority( dname, mds->get_cluster() ); - - if (dauth == whoami) { - // mine. - if (cur->dir->is_complete()) { - // file not found - return -ENOENT; - } else { - if (onfail == MDS_TRAVERSE_DISCOVER) - return -1; - - // directory isn't complete; reload - dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur << ", fetching" << endl; - lru->lru_touch(cur); // touch readdiree - mds->mdstore->fetch_dir(cur, new C_MDS_RetryMessage(mds, req)); - - mds->logger->inc("cmiss"); - mds->logger->inc("rdir"); - return 1; - } - } else { - // not mine. - - if (onfail == MDS_TRAVERSE_DISCOVER) { - // discover - dout(7) << " discover on " << *cur << " for " << dname << "..., to mds" << dauth << endl; - - // assemble+send request - vector *want = new vector; - for (int i=depth; ipush_back(path[i]); - - lru->lru_touch(cur); // touch discoveree - - mds->messenger->send_message(new MDiscover(whoami, have_clean, want), - MSG_ADDR_MDS(dauth), MDS_PORT_CACHE, - MDS_PORT_CACHE); - - // delay processing of current request - cur->dir->add_waiter(CDIR_WAIT_DENTRY, dname, new C_MDS_RetryMessage(mds, req)); - - mds->logger->inc("dis"); - mds->logger->inc("cmiss"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << " not authoritative for " << dname << ", fwd to mds" << dauth << endl; - mds->messenger->send_message(req, - MSG_ADDR_MDS(dauth), req->get_dest_port(), - req->get_dest_port()); - //show_imports(); - - mds->logger->inc("cfw"); - return 1; - } - if (onfail == MDS_TRAVERSE_FAIL) { - return -1; // -ENOENT, but only because i'm not the authority - } - } + if (onfail == MDS_TRAVERSE_DISCOVER) + return -1; + + // directory isn't complete; reload + dout(7) << " incomplete dir contents for " << *cur << ", fetching" << endl; + lru->lru_touch(cur); // touch readdiree + mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, req)); + + mds->logger->inc("cmiss"); + mds->logger->inc("rdir"); + return 1; } } else { - dout(7) << *cur << " not a dir " << cur->inode.isdir << endl; - return -ENOTDIR; + // not mine. + + if (onfail == MDS_TRAVERSE_DISCOVER) { + // discover + filepath want = path.subpath(depth); + + dout(7) << " discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; + + lru->lru_touch(cur); // touch discoveree + + mds->messenger->send_message(new MDiscover(mds->get_nodeid(), + cur->ino(), + want, + false), + MSG_ADDR_MDS(dauth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + + // delay processing of current request + cur->dir->add_waiter(CDIR_WAIT_DENTRY, + path[depth], + new C_MDS_RetryMessage(mds, req)); + + mds->logger->inc("dis"); + mds->logger->inc("cmiss"); + return 1; + } + if (onfail == MDS_TRAVERSE_FORWARD) { + // forward + dout(7) << " not auth for " << path[depth] << ", fwd to mds" << dauth << endl; + mds->messenger->send_message(req, + MSG_ADDR_MDS(dauth), req->get_dest_port(), + req->get_dest_port()); + //show_imports(); + + mds->logger->inc("cfw"); + return 1; + } + if (onfail == MDS_TRAVERSE_FAIL) { + return -1; // -ENOENT, but only because i'm not the authority + } } - trace.push_back(cur); - //read_wait(cur, req); // wait for read access + assert(0); // i shouldn't get here } - + // success. return 0; } @@ -722,12 +745,10 @@ int MDCache::path_traverse(string& pathname, - - // REPLICAS -int MDCache::handle_discover(MDiscover *dis) +void MDCache::handle_discover(MDiscover *dis) { int whoami = mds->get_nodeid(); @@ -737,15 +758,15 @@ int MDCache::handle_discover(MDiscover *dis) // get started. if (dis->get_base_ino() == 0) { // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() << " wants root + " << m->get_want().get_path() << endl; + dout(7) << "handle_discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl; assert(mds->get_nodeid() == 0); assert(root->is_auth()); // add root reply = new MDiscoverReply(0); - reply->add_inode( CInodeDiscover( root, - root->cached_by_add( m->get_asker() ) ) ); + reply->add_inode( new CInodeDiscover( root, + root->cached_by_add( dis->get_asker() ) ) ); dir = root->dir; @@ -754,7 +775,7 @@ int MDCache::handle_discover(MDiscover *dis) CInode *in = get_inode(dis->get_base_ino()); assert(in); - dout(7) << "handle_discover from mds" << dis->get_asker() << " has " << *in << " wants " << m->get_want().get_path() << endl; + dout(7) << "handle_discover from mds" << dis->get_asker() << " has " << *in << " wants " << dis->get_want().get_path() << endl; assert(in->is_dir()); @@ -764,7 +785,7 @@ int MDCache::handle_discover(MDiscover *dis) if (dir->is_proxy() && !dir->is_hashed()) { // fwd to dir auth dout(7) << "i am proxy, fwd to dir_auth " << dir->authority() << endl; - mds->messenger->send_message( m, + mds->messenger->send_message( dis, MSG_ADDR_MDS( dir->authority() ), MDS_PORT_CACHE, MDS_PORT_CACHE ); return; } @@ -787,9 +808,20 @@ int MDCache::handle_discover(MDiscover *dis) dout(7) << *dir << " auth is " << dir->authority() << ", i'm done" << endl; break; } + + // frozen? + /* hmmm do we care, actually? + if (dir->is_frozen()) { + dout(7) << *dir << " frozen, waiting" << endl; + dir->add_waiter(new C_MDS_RetryMessage( dis, mds )); + delete reply; + return; + } + */ + dout(7) << "adding dir " << *dir << endl; - reply.add_dir( CDirDiscover( dir, - dir->open_by_add( m->get_asker() ) ) ); + reply->add_dir( new CDirDiscover( dir, + dir->open_by_add( dis->get_asker() ) ) ); } // lookup dentry @@ -808,12 +840,13 @@ int MDCache::handle_discover(MDiscover *dis) // add dentry + inode dout(7) << "adding dentry " << dn << " + " << *next << endl; reply->add_dentry( dis->get_dentry(i) ); - reply->add_inode( CInodeDiscover(next, - next->cached_by_add(dis->get_asker())) ); + reply->add_inode( new CInodeDiscover(next, + next->cached_by_add(dis->get_asker())) ); } else { // don't have it? - if (cur->dir->is_complete()) { + if (dir->is_complete()) { // ... + // i shoudl return an error falg in the reply... FIXME assert(0); // for now } else { delete reply; @@ -821,13 +854,13 @@ int MDCache::handle_discover(MDiscover *dis) // readdir dout(7) << "mds" << whoami << " incomplete dir contents for " << *dir << ", fetching" << endl; mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, dis)); - return 0; + return; } } } // how did we do. - if (reply.is_empty()) { + if (reply->is_empty()) { // discard empty reply delete reply; @@ -835,7 +868,7 @@ int MDCache::handle_discover(MDiscover *dis) if (dir->is_proxy()) { // fwd to auth dout(7) << "i am dir proxy, fwd to auth " << dir->authority() << endl; - mds->messenger->send_message( m, + mds->messenger->send_message( dis, MSG_ADDR_MDS( dir->authority() ), MDS_PORT_CACHE, MDS_PORT_CACHE ); return; } @@ -844,9 +877,9 @@ int MDCache::handle_discover(MDiscover *dis) } else { // send back to asker - dout(7) << "sending result back to asker " << m->get_asker() << endl; + dout(7) << "sending result back to asker " << dis->get_asker() << endl; mds->messenger->send_message(reply, - m->get_asker(), MDS_PORT_CACHE, MDS_PORT_CACHE); + dis->get_asker(), MDS_PORT_CACHE, MDS_PORT_CACHE); } // done. @@ -854,162 +887,109 @@ int MDCache::handle_discover(MDiscover *dis) } -int MDCache::handle_discover_reply(MDiscoverReply *dis) +void MDCache::handle_discover_reply(MDiscoverReply *m) { - - - // this is a result - vector trace = dis->get_trace(); + // starting point + CInode *cur; + + if (m->has_root()) { + // nowhere! + dout(7) << "handle_discover_reply root + " << m->get_path() << endl; + assert(!root); + } else { + // grab inode + cur = get_inode(m->get_base_ino()); - if (dis->just_root()) { - dout(7) << "handle_discover got root" << endl; - - CInode *root = new CInode(); - root->inode = trace[0].inode; - root->cached_by = trace[0].cached_by; - root->cached_by.insert(whoami); // obviously i have it too - root->dir_auth = trace[0].dir_auth; - root->dir = new CDir(root, whoami); // not auth - assert(!root->dir->is_auth()); - root->dir->dir_rep = trace[0].dir_rep; - root->dir->dir_rep_by = trace[0].dir_rep_by; - root->state_set(CINODE_STATE_ROOT); - root->set_auth(false); - root->replica_nonce = trace[0].replica_nonce; - assert(root->replica_nonce == CINODE_ROOT_NONCE); - - if (trace[0].is_syncbyauth) root->dist_state |= CINODE_DIST_SYNCBYAUTH; - if (trace[0].is_softasync) root->dist_state |= CINODE_DIST_SOFTASYNC; - if (trace[0].is_lockbyauth) root->dist_state |= CINODE_DIST_LOCKBYAUTH; - - set_root( root ); - - opening_root = false; - - // done - delete dis; - - // finish off. - list finished; - finished.splice(finished.end(), waiting_for_root); - - list::iterator it; - for (it = finished.begin(); it != finished.end(); it++) { - Context *c = *it; - c->finish(0); - delete c; - } - - return 0; + if (!cur) { + dout(7) << "handle_discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; + delete m; + return; } - // traverse to start point - vector trav; + dout(7) << "handle_discover_reply " << *cur << " + " << m->get_path() << endl; + } + + list finished; + + for (int i=0; iget_num_inodes(); i++) { + // dir + if (i || m->has_base_dir()) { + if (cur->dir) { + // had it + dout(7) << "had " << *cur->dir; + m->get_dir(i).update_dir(cur->dir); + dout2(7) << ", now " << *cur->dir << endl; + } else { + // add it (_replica_) + cur->set_dir( new CDir(cur, mds, false) ); + m->get_dir(i).update_dir(cur->dir); + dout(7) << "added " << *cur->dir; + } + } - dout(7) << "handle_discover got result" << endl; - - int r = path_traverse(dis->get_basepath(), trav, dis, MDS_TRAVERSE_FAIL); // FIXME BUG?? - if (r < 0) { - dout(1) << "handle_discover result, but not in cache any more. dropping." << endl; - delete dis; - return 0; - } - if (r > 0) { - dout(7) << "waiting for something" << endl; - return 0; + // lookup dentry + CInode *in = 0; + if (i || m->has_base_dentry()) { + CDentry *dn = cur->dir->lookup( m->get_dentry(i) ); + if (dn) in = dn->get_inode(); } - CInode *cur = trav[trav.size()-1]; - CInode *start = cur; - - vector *wanted = dis->get_want(); - - list finished; - - // add duplicated dentry+inodes - for (int i=0; idir) cur->dir = new CDir(cur, whoami); - - CInode *in; - CDentry *dn = cur->dir->lookup( (*wanted)[i] ); + // inode + if (in) { + // had it + assert(in == get_inode( m->get_ino(i) )); - int dentry_auth = cur->dir->dentry_authority( dn->name, mds->get_cluster() ); - - if (dn) { - // already had it? (parallel discovers?) - dout(7) << "huh, already had " << (*wanted)[i] << endl; - in = dn->inode; - } else { - if (dentry_auth == whoami) { - // uh oh, discover has something that's ours, and we don't have. readdir and delay! - dout(3) << "huh, dentry has item " << *cur << " dentry " << dn->name << ", which is ours, but we don't have. fetching dir!" << endl; - mds->mdstore->fetch_dir(cur, - new C_MDS_RetryMessage(mds, dis)); - return 0; - } - + dout(7) << "had " << *in; + m->get_inode(i).update_inode(in); + dout2(7) << ", now " << *in << endl; - in = new CInode(); - - // assim discover info - in->inode = trace[i].inode; - in->cached_by = trace[i].cached_by; - in->cached_by.insert(whoami); // obviously i have it too - in->dir_auth = trace[i].dir_auth; // note: simplified, see below - - in->auth = false; - in->replica_nonce = trace[i].replica_nonce; - - if (in->is_dir()) { - in->dir = new CDir(in, whoami); // can't be ours (an import) or it'd be in our cache. - assert(!in->dir->is_auth()); - in->dir->dir_rep = trace[i].dir_rep; - in->dir->dir_rep_by = trace[i].dir_rep_by; - assert(!in->dir->is_auth()); - } + } else { + // add inode + in = new CInode(false); - if (trace[i].is_syncbyauth) in->dist_state |= CINODE_DIST_SYNCBYAUTH; - if (trace[i].is_softasync) in->dist_state |= CINODE_DIST_SOFTASYNC; - if (trace[i].is_lockbyauth) in->dist_state |= CINODE_DIST_LOCKBYAUTH; + m->get_inode(0).update_inode(in); + + if (!i && m->has_root()) { + // root + in->state_set(CINODE_STATE_ROOT); + set_root( in ); + finished.splice(finished.end(), waiting_for_root); + } else { // link in add_inode( in ); - link_inode( cur, (*wanted)[i], in ); - - // simplify dir_auth - if (in->authority(mds->get_cluster()) == trace[i].dir_auth) - in->dir_auth = CDIR_AUTH_PARENT; - - dout(7) << " discover assimilating " << *in << endl; + link_inode( cur->dir, m->get_dentry(i), in ); + + cur->dir->take_waiting(CDIR_WAIT_DENTRY, + m->get_dentry(i), + finished); } - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - (*wanted)[i], - finished); - - cur = in; + dout(7) << "added " << *in << endl; } + + // onward! + cur = in; + } - // done - delete dis; - - // finish off waiting items - dout(7) << " i have " << finished.size() << " contexts to finish" << endl; - list::iterator it; - for (it = finished.begin(); it != finished.end(); it++) { - Context *c = *it; - c->finish(0); - delete c; - } - + // finish + dout(7) << finished.size() << " contexts to finish" << endl; + list::iterator it; + for (it = finished.begin(); + it != finished.end(); + it++) { + Context *c = *it; + c->finish(0); + delete c; + } + // done + delete m; } - void MDCache::handle_inode_get_replica(MInodeGetReplica *m) { CInode *in = get_inode(m->get_ino()); @@ -1411,7 +1391,7 @@ bool MDCache::read_soft_start(CInode *in, Message *m) !in->is_open_write()) goto yes; // i'm alone } else { // i am replica: fw to auth - int auth = in->authority(mds->get_cluster()); + int auth = in->authority(); dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); mds->messenger->send_message(m, @@ -1516,7 +1496,7 @@ bool MDCache::write_soft_start(CInode *in, Message *m) !in->is_open_write()) goto yes; // i'm alone } else { // i am replica: fw to auth - int auth = in->authority(mds->get_cluster()); + int auth = in->authority(); dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); mds->messenger->send_message(m, @@ -1582,7 +1562,7 @@ void MDCache::inode_sync_wait(CInode *in) { assert(!in->is_auth()); - int auth = in->authority(mds->get_cluster()); + int auth = in->authority(); dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl; assert(in->is_syncbyauth()); @@ -1671,11 +1651,12 @@ void MDCache::inode_sync_release(CInode *in) } +/* void MDCache::update_replica_auth(CInode *in, int realauth) { assert(0); // this is all hashing crap, fixme - int myauth = in->authority(mds->get_cluster()); + int myauth = in->authority(); if (myauth != realauth) { dout(7) << "update_replica_auth " << *in << " real auth is " << realauth << " not " << myauth << ", fiddling dir_auth" << endl; CDir *dir = in->get_parent_dir(); @@ -1684,11 +1665,12 @@ void MDCache::update_replica_auth(CInode *in, int realauth) // let's just change ownership of this dir. assert(!dir->inode->is_auth()); // i would already have correct info if dir inode were mine - dir->inode->dir_auth = realauth; - assert(in->authority(mds->get_cluster()) == realauth); // double check our work. + dir->dir_auth = realauth; + assert(in->authority() == realauth); // double check our work. assert(!dir->is_auth()); // make sure i'm still not auth for the dir. } } +*/ int MDCache::ino_proxy_auth(inodeno_t ino, int frommds) { @@ -1702,7 +1684,7 @@ int MDCache::ino_proxy_auth(inodeno_t ino, int frommds) // is this ino in the set? if (export_proxy_inos[dir].count(ino)) { - int dirauth = dir->inode->dir_authority(mds->get_cluster()); + int dirauth = dir->authority(); assert(dirauth >= 0); return dirauth; } @@ -1734,7 +1716,7 @@ void MDCache::handle_inode_sync_start(MInodeSyncStart *m) assert(!in->is_auth()); // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority(mds->get_cluster())); + assert(m->get_asker() == in->authority()); // lock it in->dist_state |= CINODE_DIST_SYNCBYAUTH; @@ -2009,7 +1991,7 @@ bool MDCache::write_hard_start(CInode *in, } else { // replica // fw to auth - int auth = in->authority(mds->get_cluster()); + int auth = in->authority(); dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); mds->messenger->send_message(m, @@ -2123,7 +2105,7 @@ void MDCache::handle_inode_lock_start(MInodeLockStart *m) in->dist_state |= CINODE_DIST_LOCKBYAUTH; // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority(mds->get_cluster())); + assert(m->get_asker() == in->authority()); // send ack mds->messenger->send_message(new MInodeLockAck(in->ino()), @@ -2347,7 +2329,7 @@ public: // redelegate list::iterator it; for (it = will_redelegate.begin(); it != will_redelegate.end(); it++) { - (*it)->redelegate(mds, ex->inode->dir_authority(mds->get_cluster())); + (*it)->redelegate(mds, ex->authority()); delete *it; // delete context } @@ -2388,7 +2370,7 @@ void MDCache::export_dir(CDir *dir, // send ExportDirPrep (ask target) dout(7) << "export_dir " << *dir << " to " << dest << ", sending ExportDirPrep" << endl; - mds->messenger->send_message(export_dir_prep(dir), + mds->messenger->send_message(export_dir_prep(dir, dest), dest, MDS_PORT_CACHE, MDS_PORT_CACHE); dir->auth_pin(); // pin dir, to hang up our freeze mds->logger->inc("ex"); @@ -2435,6 +2417,7 @@ MDCache::export_dir_prep(CDir *dir, dout(7) << "including nested export " << *exp << " in prep" << endl; + // ** write me ** } @@ -2452,7 +2435,7 @@ void MDCache::export_dir_dropsync(CDir *dir) for (it = dir->begin(); it != dir->end(); it++) { CInode *in = it->second->inode; - if (in->is_syncbyme()) { + if (in->is_auth() && in->is_syncbyme()) { dout(7) << "about to export: dropping sticky(?) sync on " << *in << endl; inode_sync_release(in); } @@ -2499,7 +2482,7 @@ void MDCache::export_dir_frozen(CDir *dir, dout(7) << " i'm rexporting a previous import" << endl; imports.erase(dir); dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CINODE_PIN_IMPORT); // unpin, no longer an import + dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import // discard nested exports (that we're handing off pair::iterator, multimap::iterator> p = @@ -2519,7 +2502,7 @@ void MDCache::export_dir_frozen(CDir *dir, exports.insert(dir); nested_exports.insert(pair(containing_import, dir)); - dir->get(CINODE_PIN_EXPORT); // i must keep it pinned + dir->get(CDIR_PIN_EXPORT); // i must keep it pinned // discard nested exports (that we're handing off) pair::iterator, multimap::iterator> p = @@ -2552,7 +2535,6 @@ void MDCache::export_dir_frozen(CDir *dir, else dir->dir_auth = dest; - // build export message MExportDir *req = new MExportDir(dir->inode, pop); // include pop @@ -2564,7 +2546,7 @@ void MDCache::export_dir_frozen(CDir *dir, dir, // base dir, // recur start point dest ); - + // send the export data! mds->messenger->send_message(req, MSG_ADDR_MDS(dest), MDS_PORT_CACHE, @@ -2574,11 +2556,13 @@ void MDCache::export_dir_frozen(CDir *dir, dir->add_waiter( CDIR_WAIT_UNFREEZE, fin ); // make list of nodes i expect an export_dir_notify_ack from - // (everybody but me!) + // (all w/ this dir open, but me!) assert(export_notify_ack_waiting[dir].empty()); - for (int i=0; i < mds->get_cluster()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - export_notify_ack_waiting[dir].insert(i); + for (set::iterator it = dir->open_by.begin(); + it != dir->open_by.end(); + it++) { + if (*it == mds->get_nodeid()) continue; + export_notify_ack_waiting[dir].insert( *it ); } } @@ -2590,41 +2574,30 @@ void MDCache::export_dir_walk(MExportDir *req, CDir *dir, int newauth) { - dout(7) << "export_dir_walk on " << *dir << " " << dir->nitems << " items" << endl; + dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; // dir crope dir_rope; - Dir_Export_State_t dstate; - dstate.ino = dir->ino(); - dstate.nitems = dir->nitems; - dstate.version = dir->version; - dstate.state = dir->state; - dstate.dir_rep = dir->dir_rep; - dstate.ndir_rep_by = dir->dir_rep_by.size(); - dstate.popularity = dir->popularity[0]; // FIXME: rest of vector? - dir_rope.append( (char*)&dstate, sizeof(dstate) ); + CDirExport dstate(dir); + dir_rope.append( dstate._rope() ); - for (set::iterator it = dir->dir_rep_by.begin(); - it != dir->dir_rep_by.end(); - it++) { - int i = *it; - dir_rope.append( (char*)&i, sizeof(int) ); - } - // mark assert(dir->is_auth()); dir->state_clear(CDIR_STATE_AUTH); - + // discard most dir state dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things. + + // proxy + dir->state_set(CDIR_STATE_PROXY); + dir->get(CDIR_PIN_PROXY); // suck up all waiters list waiting; dir->take_waiting(CINODE_WAIT_ANY, waiting); // FIXME ?? actually this is okay? fin->assim_waitlist(waiting); - - + // inodes list subdirs; @@ -2634,32 +2607,17 @@ void MDCache::export_dir_walk(MExportDir *req, in->version++; // so log entries are ignored, etc. - // idir hashed? make dir_auth explicit - if (dir->is_hashed() && - in->dir_auth == CDIR_AUTH_PARENT) - in->dir_auth = mds->get_nodeid(); + // dentry + dir_rope.append( it->first.c_str(), it->first.length()+1 ); - // if hashed, only include dirs i'm authoritative for - if (!dir->is_hashed() || (in->is_dir() && in->is_auth())) { - // dentry - dir_rope.append( it->first.c_str(), it->first.length()+1 ); - - // add inode - dir_rope.append( in->encode_export_state() ); - } + // add inode + CInodeExport istate( in ); + dir_rope.append( istate._rope() ); if (in->is_dir()) { - // sanity check - if (in->dir_is_hashed()) { - dout(7) << " encountered hashed dir " << *in->dir << endl; - assert(!in->dir || in->dir->is_hashed()); - } else - assert(!in->dir || !in->dir->is_hashed()); - // recurse? if (in->dir) { - if (in->dir->is_auth()) { // nested subdir assert(in->dir->dir_auth == CDIR_AUTH_PARENT); @@ -2680,47 +2638,31 @@ void MDCache::export_dir_walk(MExportDir *req, } } - // we don't export inodes if hashed - if (dir->is_hashed()) { - // but we do replicate my dirs on new auth - if (in->is_dir()) { - if (in->is_auth()) { - // it's mine, easy enough: new auth will replicate my inode (i included it above) - if (!in->is_cached_by(newauth)) - in->cached_by_add( newauth, CINODE_HASHREPLICA_NONCE ); - } - else { - // i'm a replica. the recipient had better discover this dir. - req->add_prediscover(dir->ino(), it->first); - } - } - } else { - // we're export this inode; fix inode state - dout(7) << "export_dir_walk exporting " << *in << endl; + // we're export this inode; fix inode state + dout(7) << "export_dir_walk exporting " << *in << endl; - if (in->is_dirty()) in->mark_clean(); + if (in->is_dirty()) in->mark_clean(); - // clear/unpin cached_by (we're no longer the authority) - in->cached_by_clear(); + // clear/unpin cached_by (we're no longer the authority) + in->cached_by_clear(); - // mark auth - assert(in->is_auth()); - in->set_auth(false); + // mark auth + assert(in->is_auth()); + in->set_auth(false); - in->replica_nonce = CINODE_EXPORT_NONCE; + in->replica_nonce = CINODE_EXPORT_NONCE; - // add to proxy - export_proxy_inos[basedir].insert(in->ino()); - in->state_set(CINODE_STATE_PROXY); - in->get(CINODE_PIN_PROXY); - - // *** other state too? + // add to proxy + export_proxy_inos[basedir].insert(in->ino()); + in->state_set(CINODE_STATE_PROXY); + in->get(CINODE_PIN_PROXY); - // waiters - list waiters; - dir->take_waiting(CDIR_WAIT_ANY, waiters); - fin->assim_waitlist(waiters); - } + // *** other state too? + + // waiters + list waiters; + dir->take_waiting(CDIR_WAIT_ANY, waiters); + fin->assim_waitlist(waiters); } req->add_dir( dir_rope ); @@ -2737,7 +2679,7 @@ void MDCache::export_dir_walk(MExportDir *req, void MDCache::handle_export_dir_notify_ack(MExportDirNotifyAck *m) { CInode *idir = get_inode(m->get_ino()); - CDir *dir = idir->get_dir(mds->get_nodeid()); + CDir *dir = idir->dir; assert(dir); assert(dir->is_frozen_tree_root()); // i'm exporting! @@ -2759,6 +2701,7 @@ void MDCache::handle_export_dir_notify_ack(MExportDirNotifyAck *m) export_notify_ack_waiting.erase(dir); // unpin proxies! + // inodes for (set::iterator it = export_proxy_inos[dir].begin(); it != export_proxy_inos[dir].end(); it++) { @@ -2768,6 +2711,16 @@ void MDCache::handle_export_dir_notify_ack(MExportDirNotifyAck *m) } export_proxy_inos.erase(dir); + // dirs + for (set::iterator it = export_proxy_dirinos[dir].begin(); + it != export_proxy_dirinos[dir].end(); + it++) { + CDir *dir = get_inode(*it)->dir; + dir->put(CDIR_PIN_PROXY); + dir->state_clear(CDIR_STATE_PROXY); + } + export_proxy_dirinos.erase(dir); + // finish export export_dir_finish(dir); } @@ -2785,7 +2738,7 @@ void MDCache::export_dir_finish(CDir *dir) // send finish to new auth mds->messenger->send_message(new MExportDirFinish(dir->ino()), - MSG_ADDR_MDS(dir->inode->dir_authority(mds->get_cluster())), + MSG_ADDR_MDS(dir->authority()), MDS_PORT_CACHE, MDS_PORT_CACHE); // unfreeze @@ -2814,7 +2767,7 @@ void MDCache::handle_export_dir_prep(MExportDirPrep *m) // must discover it! vector trav; - int r = path_traverse(m->get_path(), trav, m, MDS_TRAVERSE_DISCOVER); + int r = path_traverse(m->get_filepath(), trav, m, MDS_TRAVERSE_DISCOVER); if (r > 0) { dout(7) << "handle_export_dir_prep on " << m->get_path() << ", doing discover|fetch" << endl; return; // fw or delay @@ -2839,7 +2792,6 @@ void MDCache::handle_export_dir_prep(MExportDirPrep *m) // *** open all export dirs - // ok! dout(7) << "sending export_dir_prep_ack on " << *dir << endl; mds->messenger->send_message(new MExportDirPrepAck(dir->ino()), @@ -2978,9 +2930,14 @@ void MDCache::handle_export_dir(MExportDir *m) list imported_subdirs; // add this crap to my cache - const char *p = m->get_state().c_str(); + int off = 0; for (int i = 0; i < m->get_ndirs(); i++) { - import_dir_block(p, containing_import, oldauth, dir, imported_subdirs); + import_dir_block(m->get_state(), + off, + oldauth, + containing_import, + dir, // import root + imported_subdirs); } dout(7) << " " << imported_subdirs.size() << " imported subdirs" << endl; @@ -3008,8 +2965,8 @@ void MDCache::handle_export_dir(MExportDir *m) MDS_PORT_CACHE); dout(7) << "sending notify to others" << endl; - for (set::iterator it = open_by.begin(); - it != open_by.end(); + for (set::iterator it = dir->open_by.begin(); + it != dir->open_by.end(); it++) { assert( *it != mds->get_nodeid() ); if ( *it == m->get_source() ) continue; // not to old auth. @@ -3111,7 +3068,7 @@ CInode *MDCache::import_dentry_inode(CDir *dir, else if (dir->is_hashed()) { // import on hashed dir assert(in->is_dir()); - if (in->authority(mds->get_cluster()) == mds->get_nodeid()) + if (in->authority() == mds->get_nodeid()) in->set_auth(true); else in->set_auth(false); @@ -3127,7 +3084,7 @@ CInode *MDCache::import_dentry_inode(CDir *dir, if (!had_inode) { // add add_inode(in); - link_inode(dir->inode, dname, in); + link_inode(dir, dname, in); dout(7) << " import_dentry_inode adding " << *in << " istate.dir_auth " << istate->dir_auth << endl; } else { dout(7) << " import_dentry_inode already had " << *in << " istate.dir_auth " << istate->dir_auth << endl; @@ -3185,7 +3142,7 @@ CInode *MDCache::import_dentry_inode(CDir *dir, dout(10) << " import_dir_auth !importing. hashed dir." << endl; - int auth = in->authority(mds->get_cluster()); + int auth = in->authority(); if (in->is_auth()) { assert(in->is_cached_by(from)); @@ -3208,39 +3165,31 @@ CInode *MDCache::import_dentry_inode(CDir *dir, } -void MDCache::import_dir_block(pchar& p, - CDir *containing_import, +void MDCache::import_dir_block(crope& r, + int& off, int oldauth, + CDir *containing_import, CDir *import_root, list& imported_subdirs) { // set up dir - Dir_Export_State_t *dstate = (Dir_Export_State_t*)p; - CInode *idir = get_inode(dstate->ino); + CDirExport dstate; + off = dstate._unrope(r, off); + + CInode *idir = get_inode(dstate->get_ino()); assert(idir); - CDir *dir = idir->get_dir(mds->get_nodeid()); + CDir *dir = idir->get_or_open_dir(mds); + assert(dir); + dout(7) << " import_dir_block " << *dir << " " << dstate->nitems << " items" << endl; + + // assimilate state + dstate.update_dir( dir ); + + // mark (may already be marked from get_or_open_dir() above) + if (!dir->is_auth()) + dir->state_set(CDIR_STATE_AUTH); - if (idir->dir_is_hashed()) { - // hashed: not importing much of anything - assert(dir->is_hashed()); // i should already know! - } else { - // normal - dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. - (dstate->state & CDIR_MASK_STATE_EXPORTED); // import dirty, complete, etc. - dir->version = dstate->version; - } - dir->dir_rep = dstate->dir_rep; - dir->popularity[0] = dstate->popularity; // FIXME rest of vector? - - //assert(!dir->is_auth()); // inode may have twiddled. - dir->state_set(CDIR_STATE_AUTH); - - p += sizeof(*dstate); - for (int nrep = dstate->ndir_rep_by; nrep > 0; nrep--) { - dir->dir_rep_by.insert( *((int*)p) ); - p += sizeof(int); - } // take all waiters on this dir // NOTE: a pass of imported data is guaranteed to get all of my waiters because @@ -3293,13 +3242,13 @@ void MDCache::import_dir_block(pchar& p, // de-list under old import nested_exports.erase(in->dir); - in->dir_auth = CDIR_AUTH_PARENT; - in->put(CINODE_PIN_IMPORT); // imports are pinned, no longer import + in->dir->dir_auth = CDIR_AUTH_PARENT; + in->dir->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import } else { dout(7) << " importing nested export " << *in->dir << " to " << in->dir_auth << endl; // add this export - in->get(CINODE_PIN_EXPORT); // all exports are pinned + in->dir->get(CDIR_PIN_EXPORT); // all exports are pinned exports.insert(in->dir); nested_exports.insert(pair(containing_import, in->dir)); mds->logger->inc("imex"); @@ -3369,7 +3318,23 @@ void MDCache::handle_export_dir_warning(MExportDirWarning *m) { CInode *in = get_inode(m->get_ino()); if (!in) { - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ", don't have it" << endl; + // add to stray warning list + stray_export_warnings.insert( m->get_ino() ); + + // did i already see the notify? + if (stray_export_notifies.count(m->get_ino())) { + // i did, we're good. + dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl; + + // process the notify + map::iterator it = stray_export_notifies.find(m->get_ino()); + handle_export_dir_notify(it->second); + stray_export_notifies.erase(it); + + } else { + dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl; + } + delete m; return; } @@ -3390,52 +3355,68 @@ void MDCache::handle_export_dir_warning(MExportDirWarning *m) void MDCache::handle_export_dir_notify(MExportDirNotify *m) { + CDir *dir = 0; CInode *in = get_inode(m->get_ino()); - if (!in) { - dout(7) << "handle_export_dir_notify on " << m->get_ino() << ", don't have it" << endl; + if (in) dir = in->dir; + if (!dir) { + + // did i see the warning yet? + if (stray_export_warnings.count(m->get_ino())) { + // i did, we're all good. + dout(7) << "handle_export_dir_notify on " << m->get_ino() << ", already saw warning." << endl; + } else { + // wait for it. + dout(7) << "handle_export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl; + stray_export_notifies.insert(pair( m->get_ino(), m )); + return; + } + } else { // have i heard about it from the exporter yet? - if (!in->state_test(CINODE_STATE_EXPORTING)) { - dout(7) << "handle_export_dir_notify on " << *in << " haven't seen warning from old auth yet, waiting" << endl; - in->add_waiter(CINODE_WAIT_EXPORTWARNING, - new C_MDS_RetryMessage(mds, m)); + if (!dir->state_test(CDIR_STATE_EXPORTING)) { + dout(7) << "handle_export_dir_notify on " << *dir << " haven't seen warning from old auth yet, waiting" << endl; + dir->add_waiter(CDIR_WAIT_EXPORTWARNING, + new C_MDS_RetryMessage(mds, m)); return; } // ok! - dout(7) << "handle_export_dir_notify on " << *in << " new_auth " << m->get_new_auth() << endl; + dout(7) << "handle_export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << endl; // fix state. - in->state_clear(CINODE_STATE_EXPORTING); // exported! + dir->state_clear(CDIR_STATE_EXPORTING); // exported! // update dir_auth - if (in->authority(mds->get_cluster()) == m->get_new_auth()) { + if (in->authority() == m->get_new_auth()) { dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl; - in->dir_auth = -1; + dir->dir_auth = -1; assert(!in->is_auth()); + assert(!dir->is_auth()); } else { - in->dir_auth = m->get_new_auth(); + dir->dir_auth = m->get_new_auth(); } - assert(in->dir_authority(mds->get_cluster()) != mds->get_nodeid()); - assert(!in->dir_is_auth(mds->get_nodeid())); + assert(dir->authority() != mds->get_nodeid()); + assert(!dir->is_auth()); // debug: verify subdirs if (g_conf.mds_verify_export_dirauth) { - dout(7) << "handle_export_dir_notify on " << *in << " checking " << m->num_subdirs() << " subdirs" << endl; + dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl; for (list::iterator it = m->subdirs_begin(); it != m->subdirs_end(); it++) { CInode *idir = get_inode(*it); if (!idir) continue; // don't have it, don't care - dout(10) << "handle_export_dir_notify checking subdir " << *idir << " is auth " << idir->dir_auth << endl; - assert(idir != in); // base shouldn't be in subdir list - if (idir->dir_auth != CDIR_AUTH_PARENT) { - dout(7) << "*** weird value for dir_auth " << idir->dir_auth << " on " << *idir << ", should have been -1 probably??? ******************" << endl; + CDir *dir = idir->dir; + if (!dir) continue; + dout(10) << "handle_export_dir_notify checking subdir " << *dir << " is auth " << dir->dir_auth << endl; + assert(dir != dir); // base shouldn't be in subdir list + if (dir->dir_auth != CDIR_AUTH_PARENT) { + dout(7) << "*** weird value for dir_auth " << dir->dir_auth << " on " << *dir << ", should have been -1 probably??? ******************" << endl; assert(0); // bad news! - idir->dir_auth = -1; + //dir->dir_auth = -1; } - assert(idir->dir_authority(mds->get_cluster()) == m->get_new_auth()); + assert(dir->authority() == m->get_new_auth()); } } } @@ -4143,7 +4124,7 @@ vector MDCache::hack_add_file(string& fn, CInode *in) { } add_inode( in ); - link_inode( idir, file, in ); + link_inode( idir->dir, file, in ); vector trace; trace.push_back(idir); diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index 41ec0b68e11a5..89e634486a275 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -8,7 +8,8 @@ #include #include -#include "../include/types.h" +#include "include/types.h" +#include "include/filepath.h" #include "CInode.h" #include "CDentry.h" #include "CDir.h" @@ -24,6 +25,7 @@ class MExportDirNotify; class MExportDirNotifyAck; class MExportDirFinish; class MDiscover; +class MDiscoverReply; class MInodeGetReplica; class MInodeGetReplicaAck; class MInodeUpdate; @@ -67,7 +69,6 @@ class MDCache { MDS *mds; // root - bool opening_root; list waiting_for_root; // imports and exports @@ -82,7 +83,11 @@ class MDCache { // maps import_root_ino's to frozen dir ino's (with pending discovers) map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from - map > export_proxy_inos; + map > export_proxy_inos; + map > export_proxy_dirinos; + + set stray_export_warnings; // warnings for export dirs i don't have open + map stray_export_notifies; // inoproxysets //list ino_proxy_sets; @@ -139,11 +144,11 @@ class MDCache { void remove_inode(CInode *in); void destroy_inode(CInode *in); - int link_inode( CInode *parent, string& dname, CInode *inode ); + int link_inode( CDir *dir, string& dname, CInode *inode ); void unlink_inode( CInode *inode ); int open_root(Context *c); - int path_traverse(string& path, + int path_traverse(filepath& path, vector& trace, Message *req, int onfail); @@ -153,7 +158,8 @@ class MDCache { int proc_message(Message *m); // -- replicas -- - int handle_discover(MDiscover *dis); + void handle_discover(MDiscover *dis); + void handle_discover_reply(MDiscoverReply *m); void handle_inode_get_replica(MInodeGetReplica *m); void handle_inode_get_replica_ack(MInodeGetReplicaAck *m); @@ -203,14 +209,15 @@ class MDCache { pchar& p, int from, CDir *import_root=0, - int *would_be_dir_auth); // need for normal import + int *would_be_dir_auth = 0); // need for normal import void handle_export_dir_prep(MExportDirPrep *m); void handle_export_dir(MExportDir *m); void import_dir_finish(CDir *dir); void handle_export_dir_finish(MExportDirFinish *m); - void import_dir_block(pchar& p, - CDir *containing_import, + void import_dir_block(crope& r, + int& off, int oldauth, + CDir *containing_import, CDir *import_root, list& imported_subdirs); void got_hashed_replica(CDir *import, diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 44cd3edbc1a61..44d3625ad1591 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -355,7 +355,7 @@ int MDS::handle_client_request(MClientRequest *req) // operations that require existing files vector trace; - int r = mdcache->path_traverse(req->get_path(), trace, req, MDS_TRAVERSE_FORWARD); + int r = mdcache->path_traverse(req->get_filepath(), trace, req, MDS_TRAVERSE_FORWARD); if (r > 0) return 0; // delayed if (r == -ENOENT || r == -ENOTDIR || @@ -612,10 +612,9 @@ MClientReply *MDS::handle_client_readdir(MClientRequest *req, } // make sure i'm authoritative! - int dirauth = cur->dir_authority(mdcluster); // FIXME hashed, etc. - if (dirauth == whoami) { + if (cur->dir_is_auth()) { - if (!cur->dir) cur->dir = new CDir(cur, whoami); + cur->get_or_open_dir(this); assert(cur->dir->is_auth()); // frozen? @@ -659,22 +658,22 @@ MClientReply *MDS::handle_client_readdir(MClientRequest *req, } else { // fetch dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; - mdstore->fetch_dir(cur, new C_MDS_RetryMessage(this, req)); + mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(this, req)); return 0; } } else { - if (cur->dir) assert(!cur->dir->is_auth()); - - if (dirauth < 0) { - assert(dirauth >= 0); - } else { - // forward to authority - dout(10) << " forwarding readdir to authority " << dirauth << endl; - messenger->send_message(req, - MSG_ADDR_MDS(dirauth), MDS_PORT_SERVER, - MDS_PORT_SERVER); - //mdcache->show_imports(); - } + int dirauth = cur->authority(); + if (cur->dir) + dirauth = cur->dir->authority(); + assert(dirauth >= 0); + assert(dirauth != whoami); + + // forward to authority + dout(10) << " forwarding readdir to authority " << dirauth << endl; + messenger->send_message(req, + MSG_ADDR_MDS(dirauth), MDS_PORT_SERVER, + MDS_PORT_SERVER); + //mdcache->show_imports(); return 0; } } @@ -730,7 +729,7 @@ MClientReply *MDS::handle_client_openwr(MClientRequest *req, { if (!cur->is_auth()) { if (!cur->is_softasync()) { - int auth = cur->authority(get_cluster()); + int auth = cur->authority(); assert(auth != whoami); dout(10) << "open (write) [replica] " << *cur << " on replica, fw to auth " << auth << endl; @@ -774,7 +773,7 @@ void MDS::handle_client_openwrc(MClientRequest *req) // see if file exists vector trace; - int r = mdcache->path_traverse(req->get_path(), trace, req, MDS_TRAVERSE_FORWARD); + int r = mdcache->path_traverse(req->get_filepath(), trace, req, MDS_TRAVERSE_FORWARD); if (r > 0) return; // delayed if (r < 0) { @@ -793,12 +792,12 @@ void MDS::handle_client_openwrc(MClientRequest *req) dout(7) << "handle_client_openwrc -ENOENT on target file, creating " << dname << endl; // verify i am authoritative for this dentry (should have fwd if not) - int auth = idir->dir->dentry_authority(dname, get_cluster()); + int auth = idir->dir->dentry_authority(dname); assert(auth == whoami); // create inode and link CInode *in = mdcache->create_inode(); - mdcache->link_inode( idir, dname, in ); + mdcache->link_inode( idir->dir, dname, in ); in->mark_dirty(); @@ -873,7 +872,7 @@ void MDS::handle_client_unlink(MClientRequest *req, // am i auth? if (!in->is_auth()) { // not auth; forward! - int auth = in->authority(get_cluster()); + int auth = in->authority(); dout(7) << "handle_client_unlink not auth for " << *in << ", fwd to " << auth << endl; messenger->send_message(req, MSG_ADDR_MDS(auth), MDS_PORT_SERVER, MDS_PORT_SERVER); diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index a6ff7093b0e8f..43a606059783c 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -52,30 +52,30 @@ class MDFetchDirContext : public Context { }; -bool MDStore::fetch_dir( CInode *in, +bool MDStore::fetch_dir( CDir *dir, Context *c ) { - dout(7) << "fetch_dir " << in->inode.ino << " context is " << c << endl; + dout(7) << "fetch_dir " << *dir << " context is " << c << endl; if (c) - in->dir->add_waiter(CDIR_WAIT_COMPLETE, c); + dir->add_waiter(CDIR_WAIT_COMPLETE, c); - assert(in->dir->is_auth()); + assert(dir->is_auth()); // already fetching? - if (in->dir->state_test(CDIR_STATE_FETCHING)) { - dout(7) << "already fetching " << in->inode.ino << "; waiting" << endl; + if (dir->state_test(CDIR_STATE_FETCHING)) { + dout(7) << "already fetching " << *dir << "; waiting" << endl; return true; } - in->dir->state_set(CDIR_STATE_FETCHING); + dir->state_set(CDIR_STATE_FETCHING); // create return context - MDFetchDirContext *fin = new MDFetchDirContext( this, in->ino() ); - - if (in->dir_is_hashed()) - do_fetch_dir( in, fin, mds->get_nodeid()); // hashed + MDFetchDirContext *fin = new MDFetchDirContext( this, dir->ino() ); + + if (dir->is_hashed()) + do_fetch_dir( dir, fin, mds->get_nodeid()); // hashed else - do_fetch_dir( in, fin ); // normal + do_fetch_dir( dir, fin ); // normal } bool MDStore::fetch_dir_2( int result, @@ -189,58 +189,58 @@ public: -bool MDStore::commit_dir( CInode *in, +bool MDStore::commit_dir( CDir *dir, Context *c ) { - assert(in->dir->is_auth() || - in->dir->is_hashed()); + assert(dir->is_auth() || + dir->is_hashed()); // already committing? - if (in->dir->state_test(CDIR_STATE_COMMITTING)) { + if (dir->state_test(CDIR_STATE_COMMITTING)) { // already mid-commit! - dout(7) << "commit_dir " << *in << " already mid-commit" << endl; - in->dir->add_waiter(CDIR_WAIT_COMMITTED, c); + dout(7) << "commit_dir " << *dir << " already mid-commit" << endl; + dir->add_waiter(CDIR_WAIT_COMMITTED, c); return false; } - if (!in->dir->can_auth_pin()) { + if (!dir->can_auth_pin()) { // something must be frozen up the hiearchy! - dout(7) << "commit_dir " << *in << " can't auth_pin, waiting" << endl; - in->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_CommitDirDelay(mds, in->inode.ino, c) ); + dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; + dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, + new C_MDS_CommitDirDelay(mds, dir->ino(), c) ); return false; } // is it complete? - if (!in->dir->is_complete()) { - dout(7) << "commit_dir " << *in << " not complete, fetching first" << endl; + if (!dir->is_complete()) { + dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl; // fetch dir first - Context *fin = new MDFetchForCommitContext(this, in, c); - fetch_dir(in, fin); + Context *fin = new MDFetchForCommitContext(this, dir, c); + fetch_dir(dir, fin); return false; } // ok go - dout(7) << "commit_dir " << *in << " version " << in->dir->get_version() << endl; + dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl; // add waiter - if (c) in->dir->add_waiter(CDIR_WAIT_COMMITTED, c); + if (c) dir->add_waiter(CDIR_WAIT_COMMITTED, c); // get continuation ready - MDCommitDirContext *fin = new MDCommitDirContext(this, in); + MDCommitDirContext *fin = new MDCommitDirContext(this, dir); // state - in->dir->state_set(CDIR_STATE_COMMITTING); - in->dir->set_committing_version(); + dir->state_set(CDIR_STATE_COMMITTING); + dir->set_committing_version(); - if (in->dir_is_hashed()) { + if (dir->is_hashed()) { // hashed - do_commit_dir( in, fin, mds->get_nodeid() ); + do_commit_dir( dir, fin, mds->get_nodeid() ); } else { // non-hashed - do_commit_dir( in, fin ); + do_commit_dir( dir, fin ); } } @@ -282,7 +282,7 @@ bool MDStore::commit_dir_2( int result, class MDDoCommitDirContext : public Context { protected: MDStore *ms; - CInode *in; + CDir *dir; Context *c; int hashcode; __uint64_t version; @@ -290,42 +290,42 @@ class MDDoCommitDirContext : public Context { public: crope buffer; - MDDoCommitDirContext(MDStore *ms, CInode *in, Context *c, int w) : Context() { + MDDoCommitDirContext(MDStore *ms, CDir *dir, Context *c, int w) : Context() { this->ms = ms; - this->in = in; + this->dir = dir; this->c = c; this->hashcode = w; - version = in->dir->get_version(); + version = dir->get_version(); } void finish(int result) { - ms->do_commit_dir_2( result, in, c, version, hashcode ); + ms->do_commit_dir_2( result, dir, c, version, hashcode ); } }; -void MDStore::do_commit_dir( CInode *in, +void MDStore::do_commit_dir( CDir *dir,, Context *c, int hashcode) { - assert(in->dir->is_auth()); + assert(dir->is_auth()); - dout(11) << "do_commit_dir hashcode " << hashcode << " dir " << *in << " version " << in->dir->get_version() << endl; + dout(11) << "do_commit_dir hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; // get continuation ready - MDDoCommitDirContext *fin = new MDDoCommitDirContext(this, in, c, hashcode); + MDDoCommitDirContext *fin = new MDDoCommitDirContext(this, dir, c, hashcode); // fill buffer __uint32_t num = 0; crope dirdata; - for (CDir_map_t::iterator it = in->dir->begin(); - it != in->dir->end(); + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); it++) { if (hashcode >= 0) { - int dentryhashcode = mds->get_cluster()->hash_dentry( in->ino(), it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != hashcode) continue; } @@ -341,7 +341,7 @@ void MDStore::do_commit_dir( CInode *in, // put inode in this dir version if (it->second->get_inode()->is_dirty()) { it->second->get_inode()->float_parent_dir_version(in->dir->get_version()); - dout(12) << " dirty inode " << in->get_parent_dir_version() << " " << *(in) << endl; + dout(12) << " dirty inode " << in->get_parent_dir_version() << " " << *(it->second->get_inode()) << endl; } num++; @@ -352,19 +352,19 @@ void MDStore::do_commit_dir( CInode *in, fin->buffer.append(dirdata); // pin inode - in->dir->auth_pin(); + dir->auth_pin(); // submit to osd int osd; object_t oid; if (hashcode >= 0) { // hashed - osd = mds->mdcluster->get_hashdir_meta_osd(in->inode.ino, hashcode); - oid = mds->mdcluster->get_hashdir_meta_oid(in->inode.ino, hashcode); + osd = mds->mdcluster->get_hashdir_meta_osd(dir->ino(), hashcode); + oid = mds->mdcluster->get_hashdir_meta_oid(dir->ino(), hashcode); } else { // normal - osd = mds->mdcluster->get_meta_osd(in->inode.ino); - oid = mds->mdcluster->get_meta_oid(in->inode.ino); + osd = mds->mdcluster->get_meta_osd(dir->ino()); + oid = mds->mdcluster->get_meta_oid(dir->ino()); } mds->osd_write( osd, oid, @@ -376,16 +376,16 @@ void MDStore::do_commit_dir( CInode *in, void MDStore::do_commit_dir_2( int result, - CInode *in, + CDir *dir, Context *c, __uint64_t committed_version, int hashcode ) { - dout(11) << "do_commit_dir_2 hashcode " << hashcode << " dir " << *in << endl; + dout(11) << "do_commit_dir_2 hashcode " << hashcode << " " << *dir << endl; // mark inodes clean too (if we committed them!) - for (CDir_map_t::iterator it = in->dir->begin(); - it != in->dir->end(); + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); it++) { CInode *in = it->second->get_inode(); @@ -409,7 +409,7 @@ void MDStore::do_commit_dir_2( int result, } // unpin - in->dir->auth_unpin(); + dir->auth_unpin(); // finish if (c) { @@ -450,27 +450,27 @@ class MDDoFetchDirContext : public Context { }; -void MDStore::do_fetch_dir( CInode *in, +void MDStore::do_fetch_dir( CDir *dir, Context *c, int hashcode) { - dout(11) << "fetch_hashed_dir hashcode " << hashcode << " dir " << in->inode.ino << " context is " << c << endl; + dout(11) << "fetch_hashed_dir hashcode " << hashcode << " " << *dir << " context is " << c << endl; // create return context - MDDoFetchDirContext *fin = new MDDoFetchDirContext( this, in->ino(), c, hashcode ); + MDDoFetchDirContext *fin = new MDDoFetchDirContext( this, dir->ino(), c, hashcode ); // issue osd read int osd; object_t oid; if (hashcode >= 0) { // hashed - osd = mds->mdcluster->get_hashdir_meta_osd(in->inode.ino, hashcode); - oid = mds->mdcluster->get_hashdir_meta_oid(in->inode.ino, hashcode); + osd = mds->mdcluster->get_hashdir_meta_osd(dir->ino(), hashcode); + oid = mds->mdcluster->get_hashdir_meta_oid(dir->ino(), hashcode); } else { // normal - osd = mds->mdcluster->get_meta_osd(in->inode.ino); - oid = mds->mdcluster->get_meta_oid(in->inode.ino); + osd = mds->mdcluster->get_meta_osd(dir->ino()); + oid = mds->mdcluster->get_meta_oid(dir->ino()); } mds->osd_read( osd, oid, diff --git a/ceph/mds/MDStore.h b/ceph/mds/MDStore.h index a3c701e178890..d653e23b9260e 100644 --- a/ceph/mds/MDStore.h +++ b/ceph/mds/MDStore.h @@ -22,28 +22,28 @@ class MDStore { } // basic interface (normal or unhashed) - bool fetch_dir( CInode *in, + bool fetch_dir( CDir *dir, Context *c ); bool fetch_dir_2( int result, inodeno_t ino ); - bool commit_dir( CInode *in, + bool commit_dir( CDir *dir, Context *c ); bool commit_dir_2( int result, - CInode *in, + CDir *dir, __uint64_t committed_version ); // low level committer - void do_commit_dir( CInode *in, + void do_commit_dir( CDir *dir, Context *c, int hashcode = -1); void do_commit_dir_2( int result, - CInode *in, + CDir *dir, Context *c, __uint64_t version, int hashcode ); - void do_fetch_dir( CInode *in, + void do_fetch_dir( CDir *dir, Context *c, int hashcode = -1); void do_fetch_dir_2( int result, diff --git a/ceph/mds/events/EInodeUnlink.h b/ceph/mds/events/EInodeUnlink.h index 8899a05e416fc..57dfc80d065e8 100644 --- a/ceph/mds/events/EInodeUnlink.h +++ b/ceph/mds/events/EInodeUnlink.h @@ -14,24 +14,24 @@ and been in progress when we asked. */ class C_EIU_VerifyDirCommit : public Context { MDS *mds; - CInode *idir; + CDir *dir; Context *fin; __uint64_t version; public: - C_EIU_VerifyDirCommit(MDS *mds, CInode *idir, __uint64_t version, Context *fin) { + C_EIU_VerifyDirCommit(MDS *mds, CDir *dir, __uint64_t version, Context *fin) { this->mds = mds; - this->idir = idir; + this->dir = dir; this->version = version; this->fin = fin; } virtual void finish(int r) { - if (idir->dir->get_version() <= version) { + if (dir->get_version() <= version) { // still dirty - mds->mdstore->commit_dir(idir, + mds->mdstore->commit_dir(dir, new C_EIU_VerifyDirCommit(mds, - idir, + dir, version, fin)); return; @@ -88,16 +88,16 @@ class EInodeUnlink : public LogEvent { virtual void retire(MDS *mds, Context *c) { // commit my containing directory - CInode *idir = mds->mdcache->get_inode(dir_ino); - assert(idir); + CDir *dir = mds->mdcache->get_inode(dir_ino)->dir; + assert(dir); CInode *in = mds->mdcache->get_inode(ino); assert(in); // okay! - dout(7) << "commiting containing dir " << *idir << " which has unlinked inode " << *in << endl; - mds->mdstore->commit_dir(idir, + dout(7) << "commiting containing dir " << *dir << " which has unlinked inode " << *in << endl; + mds->mdstore->commit_dir(dir, new C_EIU_VerifyDirCommit(mds, - idir, + dir, in->get_parent_dir_version(), c)); } diff --git a/ceph/mds/events/EInodeUpdate.h b/ceph/mds/events/EInodeUpdate.h index 5c57161955062..2145dba7fefbc 100644 --- a/ceph/mds/events/EInodeUpdate.h +++ b/ceph/mds/events/EInodeUpdate.h @@ -29,12 +29,12 @@ class C_EIU_VerifyInodeUpdate : public Context { CInode *in = mds->mdcache->get_inode(ino); if (in) { // if it's mine, dirty, and the same version, commit - if (in->authority(mds->get_cluster()) == mds->get_nodeid() && // mine + if (in->authority() == mds->get_nodeid() && // mine in->is_dirty() && // dirty in->get_version() == version) { // same version that i have to deal with dout(7) << "ARGH, did EInodeUpdate commit but inode " << *in << " is still dirty" << endl; // damnit - mds->mdstore->commit_dir(in->get_parent_inode(), + mds->mdstore->commit_dir(in->get_parent_dir(), new C_EIU_VerifyInodeUpdate(mds, in->ino(), in->get_version(), @@ -98,7 +98,7 @@ class EInodeUpdate : public LogEvent { // commit my containing directory CInode *in = mds->mdcache->get_inode(inode.ino); assert(in); - CInode *parent = in->get_parent_inode(); + CDir *parent = in->get_parent_dir(); if (parent) { // okay! diff --git a/ceph/messages/MClientRequest.h b/ceph/messages/MClientRequest.h index b586532f60e65..0990988deacd5 100644 --- a/ceph/messages/MClientRequest.h +++ b/ceph/messages/MClientRequest.h @@ -2,6 +2,7 @@ #define __MCLIENTREQUEST_H #include "include/Message.h" +#include "include/filepath.h" #include "mds/MDS.h" typedef struct { @@ -14,7 +15,7 @@ typedef struct { class MClientRequest : public Message { MClientRequest_st st; - string path; + filepath path; public: MClientRequest() {} @@ -26,18 +27,20 @@ class MClientRequest : public Message { } virtual char *get_type_name() { return "creq"; } - void set_path(string& p) { path = p; } + void set_path(string& p) { path.set_path(p); } void set_ino(inodeno_t ino) { st.ino = ino; } long get_tid() { return st.tid; } int get_op() { return st.op; } int get_client() { return st.client; } inodeno_t get_ino() { return st.ino; } - string& get_path() { return path; } + string& get_path() { return path.get_path(); } + filepath& get_filepath() { return path; } + virtual int decode_payload(crope s) { s.copy(0, sizeof(st), (char*)&st); - path = s.c_str() + sizeof(st); + path.set_path( s.c_str() + sizeof(st) ); return 0; } diff --git a/ceph/messages/MDiscover.h b/ceph/messages/MDiscover.h index d75a0c98d8738..1e3b08f6733cf 100644 --- a/ceph/messages/MDiscover.h +++ b/ceph/messages/MDiscover.h @@ -15,7 +15,7 @@ class MDiscover : public Message { inodeno_t base_ino; // 0 -> none, want root bool want_base_dir; - dirpath want; // ... [/]need/this/stuff + filepath want; // ... [/]need/this/stuff public: int get_asker() { return asker; } diff --git a/ceph/messages/MDiscoverReply.h b/ceph/messages/MDiscoverReply.h index f1de8c52a8ca2..a9f93392c3382 100644 --- a/ceph/messages/MDiscoverReply.h +++ b/ceph/messages/MDiscoverReply.h @@ -20,9 +20,9 @@ class MDiscoverReply : public Message { // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino // dentry + inode [ + ... ] : discover want_base_dir=false // (dir + dentry + inode) + : discover want_base_dir=true - vector dirs; // not inode-aligned if no_base_dir = true. - filepath path; // not inode-aligned in no_base_dentry = true - vector inodes; + vector dirs; // not inode-aligned if no_base_dir = true. + filepath path; // not inode-aligned in no_base_dentry = true + vector inodes; public: // accessors @@ -38,11 +38,13 @@ class MDiscoverReply : public Message { } return false; } + string& get_path() { return path.get_path(); } // these index _arguments_ are aligned to the inodes. - CDirDiscover* get_dir(int n) { return dirs[n + no_base_dir]; } + CDirDiscover& get_dir(int n) { return *(dirs[n + no_base_dir]); } string& get_dentry(int n) { return path[n + no_base_dentry]; } - CInodeDiscover* get_inode(int n) { return inodes[n]; } + CInodeDiscover& get_inode(int n) { return *(inodes[n]); } + inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } // cons MDiscoverReply() {} @@ -51,6 +53,16 @@ class MDiscoverReply : public Message { this->base_ino = base_ino; no_base_dir = no_base_dentry = false; } + ~MDiscoverReply() { + for (vector::iterator it = dirs.begin(); + it != dirs.end(); + it++) + delete *it; + for (vector::iterator it = inodes.begin(); + it != inodes.end(); + it++) + delete *it; + } virtual char *get_type_name() { return "DisR"; } // builders @@ -63,14 +75,14 @@ class MDiscoverReply : public Message { path.add_dentry(dn); } - void add_inode(CInodeDiscover& din) { + void add_inode(CInodeDiscover* din) { if (inodes.empty() && dirs.empty()) no_base_dir = true; - if (inodes.empty() && dentries.empty()) no_base_dentry = true; - inodes.add( din ); + if (inodes.empty() && path.depth() == 0) no_base_dentry = true; + inodes.push_back( din ); } - void add_dir(CDirDiscover& dir) { - dirs.add( dir ); + void add_dir(CDirDiscover* dir) { + dirs.push_back( dir ); } @@ -88,8 +100,8 @@ class MDiscoverReply : public Message { r.copy(off, sizeof(int), (char*)&n); off += sizeof(int); for (int i=0; i_unrope(r, off); } // filepath @@ -99,8 +111,8 @@ class MDiscoverReply : public Message { r.copy(off, sizeof(int), (char*)&n); off += sizeof(int); for (int i=0; i_unrope(r, off); } return off; } @@ -112,18 +124,18 @@ class MDiscoverReply : public Message { int n = dirs.size(); r.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); + for (vector::iterator it = dirs.begin(); it != dirs.end(); it++) - r.append((*it)._rope()); + r.append((*it)->_rope()); r.append(path._rope()); n = inodes.size(); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); + for (vector::iterator it = inodes.begin(); + it != inodes.end(); it++) - r.append((*it)._rope()); + r.append((*it)->_rope()); return r; } diff --git a/ceph/msg/Message.h b/ceph/msg/Message.h index 96a7da5cad429..f6751254f34c5 100644 --- a/ceph/msg/Message.h +++ b/ceph/msg/Message.h @@ -25,6 +25,7 @@ #define MSG_MDS_INODEUPDATE 120 #define MSG_MDS_DIRUPDATE 121 #define MSG_MDS_INODEEXPIRE 122 +#define MSG_MDS_DIREXPIRE 123 #define MSG_MDS_EXPORTDIRPREP 150 #define MSG_MDS_EXPORTDIRPREPACK 151 diff --git a/ceph/msg/Messenger.cc b/ceph/msg/Messenger.cc index d769375c26a88..2251c1806c803 100644 --- a/ceph/msg/Messenger.cc +++ b/ceph/msg/Messenger.cc @@ -21,6 +21,7 @@ using namespace std; #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" #include "messages/MExportDirPrep.h" #include "messages/MExportDirPrepAck.h" @@ -34,6 +35,7 @@ using namespace std; #include "messages/MInodeUpdate.h" #include "messages/MInodeExpire.h" +#include "messages/MDirExpire.h" #include "messages/MInodeSyncStart.h" #include "messages/MInodeSyncAck.h" @@ -98,6 +100,9 @@ decode_message(crope& ser) case MSG_MDS_DISCOVER: m = new MDiscover(); break; + case MSG_MDS_DISCOVERREPLY: + m = new MDiscoverReply(); + break; case MSG_MDS_EXPORTDIR: m = new MExportDir(); @@ -124,7 +129,7 @@ decode_message(crope& ser) break; case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirPrepWarning(); + m = new MExportDirWarning(); break; case MSG_MDS_HEARTBEAT: @@ -139,6 +144,10 @@ decode_message(crope& ser) m = new MInodeExpire(); break; + case MSG_MDS_DIREXPIRE: + m = new MDirExpire(); + break; + case MSG_MDS_INODESYNCSTART: m = new MInodeSyncStart(); break; -- 2.39.5