--- /dev/null
+
+
+AUTHORITY
+
+The authority maintains a list of what nodes cache each inode.
+Additionally, each replica is assigned a serial (normally 0) to
+disambiguate multiple replicas of the same item (see below).
+
+ set<int> cached_by;
+ map<int, int> cached_by_serial;
+
+The cached_by set _always_ includes all nodes that cache the
+particular inode, but may additionally include nodes that used to
+cache it but no longer do. In those cases, an expire message should
+be in transit.
+
+
+REPLICA
+
+The replica maintains a notion of who it believes is the authority for
+each replicated inode. There are two possibilities:
+
+ - Ordinarily, this notion is correct.
+ - If the part of the file system in question was recently exported to
+ a new MDS, the inode's old authority is acting as a CACHEPROXY,
+ and will forward relevant messages on to the authority.
+
+When a replica is expired from cache, an expire is sent to the
+authority. The expire includes the serial number issued when the
+replica was originally created.
+
+
+Exports are tricky:
+
+- The old authority suddenly becomes a replica. Its serial is well
+ defined. It also becomes a CACHEPROXY, which means its cached_by
+ remains defined (with an alternate meaning!). While a proxy, the
+ node will forward relevant messages from the replica to the
+ authority (but not the other way around--the authority knows all
+ replicas).
+
+- Once the export is acked, the old authority sends a
+ message to each replica notifying it of the new authority. As soon
+ as all replicas acknowledge receipt of this notice, the old authority
+ can cease CACHEPROXY responsibilities and become a regular replica.
+ At this point its cached_by is no longer defined.
+
+
+
+
+
+
+
+- Replicas always know who the authority for the inode is, OR they
+ know a prior owner that is acting as a CACHEPROXY. (They don't know
+ which it is.)
+
+Because the authority always knows who caches an item, it can
+confidently send updates to replicas for locking, invalidating, etc.
+
+
+Expiration:
+
+When a replica is expired from cache, an expire is sent to the
+authority. If the receiving node is the authority, it simply removes
+the node from the cached_by list.
+
+If the receiving node is not the authority, it is acting as a CACHEPROXY
+(because it recently exported the data).
+
+
- dir open / discover
nonce
+ dir_auth
dir_rep/by
- dir update
// auth pins
void CDir::auth_pin() {
- get(CDIR_PIN_AUTHPIN + auth_pins);
+ if (auth_pins == 0)
+ get(CDIR_PIN_AUTHPIN);
auth_pins++;
+
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
inode->nested_auth_pins++;
void CDir::auth_unpin() {
auth_pins--;
- put(CINODE_PIN_DAUTHPIN + auth_pins);
+ if (auth_pins == 0)
+ put(CINODE_PIN_DAUTHPIN);
assert(auth_pins >= 0);
+
dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
// pending freeze?
// pins
-#define CDIR_PIN_CHILD 20000
-#define CDIR_PIN_OPENED 20001 // open by another node
-#define CDIR_PIN_HASHED 20002 // hashed
-#define CDIR_PIN_WAITER 20003 // waiter(s)
-
-#define CDIR_PIN_IMPORT 20010
-#define CDIR_PIN_EXPORT 20011
-#define CDIR_PIN_FREEZE 20012
-#define CDIR_PIN_PROXY 20013 // auth just changed.
-
-#define CDIR_PIN_AUTHPIN 30000
+#define CDIR_PIN_CHILD 0
+#define CDIR_PIN_OPENED 1 // open by another node
+#define CDIR_PIN_HASHED 2 // hashed
+#define CDIR_PIN_WAITER 3 // waiter(s)
+
+#define CDIR_PIN_IMPORT 4
+#define CDIR_PIN_EXPORT 5
+#define CDIR_PIN_FREEZE 6
+#define CDIR_PIN_PROXY 7 // auth just changed.
+
+#define CDIR_PIN_AUTHPIN 8
+
+#define CDIR_NUM_PINS 9
+// Human-readable names for the CDIR_PIN_* values, indexed by pin number;
+// the order must match the CDIR_PIN_* #defines.  This has to be an array
+// of string pointers: a plain char array cannot be initialized from a
+// list of string literals.
+static const char *cdir_pin_names[CDIR_NUM_PINS] = {
+  "child",
+  "opened",
+  "hashed",
+  "waiter",
+  "import",
+  "export",
+  "freeze",
+  "proxy",
+  "authpin"
+};
friend class MDiscover;
friend class MDBalancer;
+ friend class CDirDiscover;
+ friend class CDirExport;
+
public:
CDir(CInode *in, MDS *mds);
+// -- encoded state --
+
+// discover
+
+// Encoded directory state for a discover: the directory's inode number,
+// a nonce supplied by the caller, the directory authority, and the set
+// of nodes copied from the directory's dir_rep_by.
+//
+// Encoding produced by _rope() (raw host-order bytes, no padding):
+//   ino | nonce | dir_auth | nrep_by | nrep_by * int
+class CDirDiscover {
+  inodeno_t ino;
+  int       nonce;
+  int       dir_auth;
+  set<int>  rep_by;
+
+ public:
+  // Empty object, meant to be filled in by _unrope() on the decode side.
+  CDirDiscover() {}
+
+  // Snapshot the state of 'dir', tagging it with the given nonce.
+  CDirDiscover(CDir *dir, int nonce) {
+    ino = dir->ino();
+    this->nonce = nonce;
+    dir_auth = dir->dir_auth;
+    rep_by = dir->dir_rep_by;
+  }
+
+  // Serialize this object; returns a rope holding the encoding above.
+  crope _rope() {
+    crope r;
+
+    // fixed-size fields
+    r.append((char*)&ino, sizeof(ino));
+    r.append((char*)&nonce, sizeof(nonce));
+    r.append((char*)&dir_auth, sizeof(dir_auth));
+
+    // rep_by: element count, then the elements
+    int nrep_by = rep_by.size();
+    r.append((char*)&nrep_by, sizeof(nrep_by));
+    for (set<int>::iterator it = rep_by.begin();
+         it != rep_by.end();
+         ++it) {
+      int m = *it;
+      r.append((char*)&m, sizeof(int));
+    }
+
+    return r;
+  }
+
+  // Decode from 's' starting at byte offset 'off'.  Decoded rep_by
+  // entries are inserted into the existing set.  Returns the offset of
+  // the first byte after the encoded object.
+  int _unrope(crope s, int off = 0) {
+    // fixed-size fields, in the order _rope() wrote them
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(nonce), (char*)&nonce);
+    off += sizeof(nonce);
+    s.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+
+    // rep_by
+    int nrep_by;
+    s.copy(off, sizeof(int), (char*)&nrep_by);
+    off += sizeof(int);
+    for (int i = 0; i < nrep_by; i++) {
+      int m;
+      s.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      rep_by.insert(m);
+    }
+
+    return off;
+  }
+
+};
+
+
+// export
+
+typedef struct { // fixed-size header of an exported directory; CDirExport::_rope() appends it as raw bytes
+ inodeno_t ino; // from dir->ino()
+ __uint64_t nitems; // copied from dir->nitems
+ __uint64_t version; // copied from dir->version
+ unsigned state; // copied from dir->state
+ DecayCounter popularity; // copied from dir->popularity
+ int dir_auth; // copied from dir->dir_auth
+ int dir_rep; // copied from dir->dir_rep
+ int nopen_by; // number of (node, nonce) int pairs that follow the header
+ int nrep_by; // number of rep_by ints that follow the open_by pairs
+ // after this header: nopen_by (node, nonce) int pairs, then nrep_by ints
+} CDirExport_st;
+
+// Encoded directory state for an export: a fixed-size CDirExport_st
+// header, followed by the open_by (node, nonce) pairs and the rep_by
+// node list.
+//
+// Encoding produced by _rope() (raw host-order bytes):
+//   CDirExport_st | nopen_by * (int node, int nonce) | nrep_by * int
+class CDirExport {
+
+  CDirExport_st st;
+  map<int,int>  open_by_nonce;
+  set<int>      rep_by;
+
+ public:
+  // Empty object, meant to be filled in by _unrope() on the decode side.
+  CDirExport() {}
+
+  // Snapshot the exportable state of 'dir'.
+  CDirExport(CDir *dir) {
+    st.ino = dir->ino();
+    st.nitems = dir->nitems;
+    st.version = dir->version;
+    st.state = dir->state;
+    st.popularity = dir->popularity;
+    st.dir_auth = dir->dir_auth;
+    st.dir_rep = dir->dir_rep;
+
+    rep_by = dir->dir_rep_by;
+    open_by_nonce = dir->open_by_nonce;
+
+    // keep the header counts in step with the containers from the start;
+    // _rope() refreshes them again before encoding
+    st.nrep_by = rep_by.size();
+    st.nopen_by = open_by_nonce.size();
+  }
+
+  // Serialize this object; returns a rope holding the encoding above.
+  crope _rope() {
+    crope r;
+
+    // header, with the counts taken from the containers being written
+    st.nrep_by = rep_by.size();
+    st.nopen_by = open_by_nonce.size();
+    r.append((char*)&st, sizeof(st));
+
+    // open_by: (node, nonce) pairs
+    for (map<int,int>::iterator it = open_by_nonce.begin();
+         it != open_by_nonce.end();
+         ++it) {
+      int m = it->first;
+      r.append((char*)&m, sizeof(int));
+      int n = it->second;
+      r.append((char*)&n, sizeof(int));
+    }
+
+    // rep_by
+    for (set<int>::iterator it = rep_by.begin();
+         it != rep_by.end();
+         ++it) {
+      int m = *it;
+      r.append((char*)&m, sizeof(int));
+    }
+
+    return r;
+  }
+
+  // Decode from 's' starting at byte offset 'off'.  Decoded entries are
+  // inserted into the existing containers.  Returns the offset of the
+  // first byte after the encoded object.
+  int _unrope(crope s, int off = 0) {
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    // open_by: (node, nonce) pairs
+    for (int i = 0; i < st.nopen_by; i++) {
+      int m, n;
+      s.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      s.copy(off, sizeof(int), (char*)&n);
+      off += sizeof(int);
+      open_by_nonce.insert(pair<int,int>(m, n));
+    }
+
+    // rep_by
+    for (int i = 0; i < st.nrep_by; i++) {
+      int m;
+      s.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      rep_by.insert(m);
+    }
+
+    return off;
+  }
+
+};
+
+
+
#endif
out << "rep a=" << in.authority() << " n=" << in.get_replica_nonce();
assert(in.get_replica_nonce() >= 0);
}
+ if (in.is_pinned()) {
+   // List every pin currently held: known pin values are printed by
+   // name, anything outside the name table is printed as a raw number.
+   // The table size macro is CINODE_NUM_PINS (the bound for
+   // cinode_pin_names); the range check also rejects negative values
+   // so the table is never indexed out of bounds.
+   out << "pins";
+   const set<int>& pins = in.get_ref_set();
+   for (set<int>::const_iterator it = pins.begin();
+        it != pins.end();
+        ++it) {
+     if (*it >= 0 && *it < CINODE_NUM_PINS)
+       out << " " << cinode_pin_names[*it];
+     else
+       out << " " << *it;
+   }
+ }
out << "]";
return out;
}
}
void CInode::auth_pin() {
- get(CINODE_PIN_AUTHPIN + auth_pins);
+ if (auth_pins == 0)
+ get(CINODE_PIN_AUTHPIN);
auth_pins++;
+
dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
if (parent)
parent->dir->adjust_nested_auth_pins( 1 );
}
void CInode::auth_unpin() {
auth_pins--;
+ if (auth_pins == 0)
+ put(CINODE_PIN_AUTHPIN);
+
dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
- put(CINODE_PIN_AUTHPIN + auth_pins);
+
if (parent)
parent->dir->adjust_nested_auth_pins( -1 );
}
*/
// pins for keeping an item in cache (and debugging)
-#define CINODE_PIN_DIR 10000
-#define CINODE_PIN_CACHED 10001
-#define CINODE_PIN_DIRTY 10002 // must flush
-#define CINODE_PIN_PROXY 10004 // can't expire yet
-#define CINODE_PIN_WAITER 10005 // waiter
-
-#define CINODE_PIN_OPENRD 10020
-#define CINODE_PIN_OPENWR 10021
-#define CINODE_PIN_UNLINKING 10022
+#define CINODE_PIN_DIR 0
+#define CINODE_PIN_CACHED 1
+#define CINODE_PIN_DIRTY 2 // must flush
+#define CINODE_PIN_PROXY 3 // can't expire yet
+#define CINODE_PIN_WAITER 4 // waiter
+
+#define CINODE_PIN_OPENRD 5
+#define CINODE_PIN_OPENWR 6
+#define CINODE_PIN_UNLINKING 7
+
+#define CINODE_PIN_AUTHPIN 8
+
+#define CINODE_NUM_PINS 9
+// Human-readable names for the CINODE_PIN_* values, indexed by pin number;
+// the order must match the CINODE_PIN_* #defines.  The entries point at
+// string literals, so the element type is const char *.
+static const char *cinode_pin_names[CINODE_NUM_PINS] = {
+  "dir",
+  "cached",
+  "dirty",
+  "proxy",
+  "waiter",
+  "openrd",
+  "openwr",
+  "unlinking",
+  "authpin"
+};
-#define CINODE_PIN_AUTHPIN 30000
//#define CINODE_PIN_SYNCBYME 70000
// -- cached_by -- to be used ONLY when we're authoritative or cacheproxy
bool is_cached_by_anyone() { return !cached_by.empty(); }
bool is_cached_by(int mds) { return cached_by.count(mds); }
+ int num_cached_by() { return cached_by.size(); }
// cached_by_add returns a nonce
int cached_by_add(int mds) {
if (is_cached_by(mds)) { // already had it?
// -- reference counting --
+ bool is_pinned() { return ref > 0; }
+ set<int>& get_ref_set() { return ref_set; }
void put(int by) {
if (ref == 0 || ref_set.count(by) != 1) {
dout(7) << " bad put " << *this << " by " << by << " was " << ref << " (" << ref_set << ")" << endl;
-// encoded state
+// -- encoded state
+
+// discover
class CInodeDiscover {
bool is_softasync;
bool is_lockbyauth;
-
+ CInodeDiscover() {}
CInodeDiscover(CInode *in) {
inode = in->inode;
replica_nonce = in->get_replica_nonce();
}
int _unrope(crope s, int off = 0) {
- s.copy(0,sizeof(inode_t), (char*)&inode);
+ s.copy(off,sizeof(inode_t), (char*)&inode);
off += sizeof(inode_t);
s.copy(off, sizeof(int), (char*)&replica_nonce);
off += sizeof(int);
};
+// export
+
typedef struct {
inode_t inode;
__uint64_t version;
int ncached_by; // int pairs follow
} CInodeExport_st;
-
class CInodeExport {
CInodeExport_st st;
set<int> cached_by;
map<int,int> cached_by_nonce;
+ CInodeExport() {}
CInodeExport(CInode *in) {
st.inode = in->inode;
st.version = in->get_version();
cached_by_nonce = in->get_cached_by_nonce();
}
-
crope _rope() {
crope r;
-
+ st.ncached_by = cached_by.size();
+ r.append((char*)&st, sizeof(st));
+
+ // cached_by + nonce
+ for (map<int,int>::iterator it = cached_by_nonce.begin();
+ it != cached_by_nonce.end();
+ it++) {
+ int m = it->first;
+ r.append((char*)&m, sizeof(int));
+ int n = it->second;
+ r.append((char*)&n, sizeof(int));
+ }
+
return r;
}
- int _unrope(crope s, int off = 0) {
+ int _unrope(crope s, int off = 0) {
+ s.copy(off, sizeof(st), (char*)&st);
+ off += sizeof(st);
+
+ for (int i=0; i<st.ncached_by; i++) {
+ int m,n;
+ s.copy(off, sizeof(int), (char*)&m);
+ off += sizeof(int);
+ s.copy(off, sizeof(int), (char*)&n);
+ off += sizeof(int);
+ cached_by.insert(m);
+ cached_by_nonce.insert(pair<int,int>(m,n));
+ }
+ return off;
}
-
-
};