From a490fe6f5757585107ad67ad01e1d1ffd73e13e2 Mon Sep 17 00:00:00 2001 From: sage Date: Mon, 4 Oct 2004 22:19:59 +0000 Subject: [PATCH] ... git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@107 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/doc/caching.txt | 71 ++++++++++++++++ ceph/mds/CDir.cc | 9 +- ceph/mds/CDir.h | 193 ++++++++++++++++++++++++++++++++++++++++--- ceph/mds/CInode.cc | 20 ++++- ceph/mds/CInode.h | 81 +++++++++++++----- 5 files changed, 340 insertions(+), 34 deletions(-) create mode 100644 ceph/doc/caching.txt diff --git a/ceph/doc/caching.txt b/ceph/doc/caching.txt new file mode 100644 index 0000000000000..e9045e812558f --- /dev/null +++ b/ceph/doc/caching.txt @@ -0,0 +1,71 @@ + + +AUTHORITY + +The authority maintains a list of what nodes cache each inode. +Additionally, each replica is assigned a serial (normally 0) to +disambiguate multiple replicas of the same item (see below). + + set cached_by; + map cached_by_serial; + +The cached_by set _always_ includes all nodes that cache the +particular inode, but may additionally include nodes that used to +cache it but no longer do. In those cases, an expire message should +be in transit. + + +REPLICA + +The replica maintains a notion of who it believes is the authority for +each replicated inode. There are two possibilities: + + - Ordinarily, this notion is correct. + - If the part of the file system in question was recently exported to + a new MDS, the inode's old authority is acting as a CACHEPROXY, + and will forward relevant messages on to the authority. + +When a replica is expired from cache, an expire is sent to the +authority. The expire includes the serial number issued when the +replica was originally created. + + +Exports are tricky: + +- The old authority suddenly becomes a replica. Its serial is well + defined. It also becomes a CACHEPROXY, which means its cached_by + remains defined (with an alternate meaning!). 
While a proxy, the + node will forward relevant messages from the replica to the + authority (but not the other way around--the authority knows all + replicas). + +- Once the export is acked, the old authority sends a + message to the replica notifying it of the new authority. As soon + as all replicas acknowledge receipt of this notice, the old authority + can cease CACHEPROXY responsibilities and become a regular replica. + At this point its cached_by is no longer defined. + + + + + + + +- Replicas always know who the authority for the inode is, OR they + know a prior owner acting as a CACHEPROXY. (They don't know which it + is.) + +Because the authority always knows who caches an item, it can +confidently send updates to replicas for locking, invalidating, etc. + + +Expiration: + +When a replica is expired from cache, an expire is sent to the +authority. If the receiving node is the authority, it simply removes +the node from the cached_by list. + +If the receiving node is not the authority, it is acting as a CACHEPROXY +(because it recently exported the data). + + diff --git a/ceph/mds/CDir.cc b/ceph/mds/CDir.cc index e0ca25fc10e48..07b6dfb931359 100644 --- a/ceph/mds/CDir.cc +++ b/ceph/mds/CDir.cc @@ -146,6 +146,7 @@ what DIR state is encoded when - dir open / discover nonce + dir_auth dir_rep/by - dir update @@ -423,8 +424,10 @@ int CDir::dentry_authority(const string& dn ) // auth pins void CDir::auth_pin() { - get(CDIR_PIN_AUTHPIN + auth_pins); + if (auth_pins == 0) + get(CDIR_PIN_AUTHPIN); auth_pins++; + dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; inode->nested_auth_pins++; @@ -434,8 +437,10 @@ void CDir::auth_pin() { void CDir::auth_unpin() { auth_pins--; - put(CINODE_PIN_DAUTHPIN + auth_pins); + if (auth_pins == 0) + put(CINODE_PIN_DAUTHPIN); assert(auth_pins >= 0); + dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; // pending freeze? 
diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index 8e14c7bf2047a..a73b371f01645 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -80,17 +80,30 @@ class Context; // pins -#define CDIR_PIN_CHILD 20000 -#define CDIR_PIN_OPENED 20001 // open by another node -#define CDIR_PIN_HASHED 20002 // hashed -#define CDIR_PIN_WAITER 20003 // waiter(s) - -#define CDIR_PIN_IMPORT 20010 -#define CDIR_PIN_EXPORT 20011 -#define CDIR_PIN_FREEZE 20012 -#define CDIR_PIN_PROXY 20013 // auth just changed. - -#define CDIR_PIN_AUTHPIN 30000 +#define CDIR_PIN_CHILD 0 +#define CDIR_PIN_OPENED 1 // open by another node +#define CDIR_PIN_HASHED 2 // hashed +#define CDIR_PIN_WAITER 3 // waiter(s) + +#define CDIR_PIN_IMPORT 4 +#define CDIR_PIN_EXPORT 5 +#define CDIR_PIN_FREEZE 6 +#define CDIR_PIN_PROXY 7 // auth just changed. + +#define CDIR_PIN_AUTHPIN 8 + +#define CDIR_NUM_PINS 9 +static char cdir_pin_names[CDIR_NUM_PINS] = { + "child", + "opened", + "hashed", + "waiter", + "import", + "export", + "freeze", + "proxy", + "authpin" +}; @@ -193,6 +206,9 @@ class CDir { friend class MDiscover; friend class MDBalancer; + friend class CDirDiscover; + friend class CDirExport; + public: CDir(CInode *in, MDS *mds); @@ -395,4 +411,159 @@ class CDir { +// -- encoded state -- + +// discover + +class CDirDiscover { + inodeno_t ino; + int nonce; + int dir_auth; + set rep_by; + + CDirDiscover(CDir *dir, int nonce) { + ino = dir->ino(); + this->nonce = nonce; + dir_auth = dir->dir_auth; + rep_by = dir->dir_rep_by; + } + + crope _rope() { + crope r; + + r.append((char*)&ino, sizeof(ino)); + r.append((char*)&nonce, sizeof(nonce)); + r.append((char*)&dir_auth, sizeof(dir_auth)); + + int nrep_by = rep_by.size(); + r.append((char*)&nrep_by, sizeof(nrep_by)); + + // rep_by + for (set::iterator it = rep_by.begin(); + it != rep_by.end(); + it++) { + int m = *it; + r.append((char*)&m, sizeof(int)); + } + + return r; + } + + int _unrope(crope s, int off = 0) { + s.copy(off, sizeof(ino), (char*)&ino); + off += 
sizeof(ino); + s.copy(off, sizeof(nonce), (char*)&nonce); + off += sizeof(nonce); + s.copy(off, sizeof(dir_auth), (char*)&dir_auth); + off += sizeof(dir_auth); + + int nrep_by; + s.copy(off, sizeof(int), (char*)&nrep_by); + off += sizeof(int); + + // open_by + for (int i=0; i open_by_nonce; + set rep_by; + + CDirExport(CDir *dir) { + st.ino = dir->ino(); + st.nitems = dir->nitems; + st.version = dir->version; + st.state = dir->state; + st.popularity = dir->popularity; + st.dir_auth = dir->dir_auth; + st.dir_rep = dir->dir_rep; + + rep_by = dir->dir_rep_by; + open_by_nonce = dir->open_by_nonce; + } + + crope _rope() { + crope r; + + st.nrep_by = rep_by.size(); + st.nopen_by = open_by_nonce.size(); + r.append((char*)&st, sizeof(st)); + + // open_by + for (map::iterator it = open_by_nonce.begin(); + it != open_by_nonce.end(); + it++) { + int m = it->first; + r.append((char*)&m, sizeof(int)); + int n = it->second; + r.append((char*)&n, sizeof(int)); + } + + // rep_by + for (set::iterator it = rep_by.begin(); + it != rep_by.end(); + it++) { + int m = *it; + r.append((char*)&m, sizeof(int)); + } + + return r; + } + + int _unrope(crope s, int off = 0) { + s.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + + // open_by + for (int i=0; i(m,n)); + } + + // rep_by + for (int i=0; i::iterator it = in.get_ref_set().begin(); + it != in.get_ref_set().end(); + it++) + if (*it < CINODE_PIN_NUM) + out << " " << cinode_pin_names[*it]; + else + out << " " << *it; + } out << "]"; return out; } @@ -359,17 +369,23 @@ bool CInode::can_auth_pin() { } void CInode::auth_pin() { - get(CINODE_PIN_AUTHPIN + auth_pins); + if (auth_pins == 0) + get(CINODE_PIN_AUTHPIN); auth_pins++; + dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; + if (parent) parent->dir->adjust_nested_auth_pins( 1 ); } void CInode::auth_unpin() { auth_pins--; + if (auth_pins == 0) + put(CINODE_PIN_AUTHPIN); + dout(7) << "auth_unpin on " << *this << " count now " 
<< auth_pins << " + " << nested_auth_pins << endl; - put(CINODE_PIN_AUTHPIN + auth_pins); + if (parent) parent->dir->adjust_nested_auth_pins( -1 ); } diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index e448b292215fb..6c518dd967aa7 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -33,17 +33,31 @@ using namespace std; */ // pins for keeping an item in cache (and debugging) -#define CINODE_PIN_DIR 10000 -#define CINODE_PIN_CACHED 10001 -#define CINODE_PIN_DIRTY 10002 // must flush -#define CINODE_PIN_PROXY 10004 // can't expire yet -#define CINODE_PIN_WAITER 10005 // waiter - -#define CINODE_PIN_OPENRD 10020 -#define CINODE_PIN_OPENWR 10021 -#define CINODE_PIN_UNLINKING 10022 +#define CINODE_PIN_DIR 0 +#define CINODE_PIN_CACHED 1 +#define CINODE_PIN_DIRTY 2 // must flush +#define CINODE_PIN_PROXY 3 // can't expire yet +#define CINODE_PIN_WAITER 4 // waiter + +#define CINODE_PIN_OPENRD 5 +#define CINODE_PIN_OPENWR 6 +#define CINODE_PIN_UNLINKING 7 + +#define CINODE_PIN_AUTHPIN 8 + +#define CINODE_NUM_PINS 9 +static char *cinode_pin_names[CINODE_NUM_PINS] = { + "dir", + "cached", + "dirty", + "proxy", + "waiter", + "openrd", + "openwr", + "unlinking", + "authpin" +}; -#define CINODE_PIN_AUTHPIN 30000 //#define CINODE_PIN_SYNCBYME 70000 @@ -283,6 +297,7 @@ class CInode : LRUObject { // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy bool is_cached_by_anyone() { return !cached_by.empty(); } bool is_cached_by(int mds) { return cached_by.count(mds); } + int num_cached_by() { return cached_by.size(); } // cached_by_add returns a nonce int cached_by_add(int mds) { if (is_cached_by(mds)) { // already had it? 
@@ -414,6 +429,8 @@ class CInode : LRUObject { // -- reference counting -- + bool is_pinned() { return ref > 0; } + set& get_ref_set() { return ref_set; } void put(int by) { if (ref == 0 || ref_set.count(by) != 1) { dout(7) << " bad put " << *this << " by " << by << " was " << ref << " (" << ref_set << ")" << endl; @@ -462,7 +479,9 @@ class CInode : LRUObject { -// encoded state +// -- encoded state + +// discover class CInodeDiscover { @@ -472,7 +491,7 @@ class CInodeDiscover { bool is_softasync; bool is_lockbyauth; - + CInodeDiscover() {} CInodeDiscover(CInode *in) { inode = in->inode; replica_nonce = in->get_replica_nonce(); @@ -492,7 +511,7 @@ class CInodeDiscover { } int _unrope(crope s, int off = 0) { - s.copy(0,sizeof(inode_t), (char*)&inode); + s.copy(off,sizeof(inode_t), (char*)&inode); off += sizeof(inode_t); s.copy(off, sizeof(int), (char*)&replica_nonce); off += sizeof(int); @@ -508,6 +527,8 @@ class CInodeDiscover { }; +// export + typedef struct { inode_t inode; __uint64_t version; @@ -518,13 +539,13 @@ typedef struct { int ncached_by; // int pairs follow } CInodeExport_st; - class CInodeExport { CInodeExport_st st; set cached_by; map cached_by_nonce; + CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; st.version = in->get_version(); @@ -535,17 +556,39 @@ class CInodeExport { cached_by_nonce = in->get_cached_by_nonce(); } - crope _rope() { crope r; - + st.ncached_by = cached_by.size(); + r.append((char*)&st, sizeof(st)); + + // cached_by + nonce + for (map::iterator it = cached_by_nonce.begin(); + it != cached_by_nonce.end(); + it++) { + int m = it->first; + r.append((char*)&m, sizeof(int)); + int n = it->second; + r.append((char*)&n, sizeof(int)); + } + return r; } - int _unrope(crope s, int off = 0) { + int _unrope(crope s, int off = 0) { + s.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + + for (int i=0; i(m,n)); + } + return off; } - - }; -- 2.39.5