- probably kill base case in encoder.h, replace with int types, with appropriate swabbing?
- addr=?
+client leases
+- clean up readdir vs stat leases
+ - esp on client.. keep mask/ttl and onetime_mask/onetime_ttl?
+
kernel client
- make sure link/unlink results reflected by inode/dentry cache (let fill_trace do it? invalidate? do actual update?)
- procfs/debugfs
*
* insert + link a single dentry + inode into the metadata cache.
*/
-Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname)
+Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname, utime_t ttl)
{
Dentry *dn = NULL;
if (dir->dentries.count(dname))
} else {
// actually update info
dout(12) << " stat inode mask is " << st->mask << dendl;
- if (st->mask & STAT_MASK_BASE) {
+ if (st->mask & CEPH_STAT_MASK_INODE) {
dn->inode->inode = st->inode;
dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask!
}
// ...but don't clobber our mtime, size!
/* isn't this handled below?
- if ((dn->inode->mask & STAT_MASK_SIZE) == 0 &&
+ if ((dn->inode->mask & CEPH_STAT_MASK_SIZE) == 0 &&
dn->inode->file_wr_size > dn->inode->inode.size)
dn->inode->inode.size = dn->inode->file_wr_size;
- if ((dn->inode->mask & STAT_MASK_MTIME) == 0 &&
+ if ((dn->inode->mask & CEPH_STAT_MASK_MTIME) == 0 &&
dn->inode->file_wr_mtime > dn->inode->inode.mtime)
dn->inode->inode.mtime = dn->inode->file_wr_mtime;
*/
// save the mask
dn->inode->mask = st->mask;
+ dn->inode->ttl = ttl;
// or do we have newer size/mtime from writing?
if (dn->inode->file_wr_size > dn->inode->inode.size)
inode_map[root->inode.ino] = root;
root->dir_auth = 0;
}
+ cur->ttl = ttl;
+ cur->mask = (*pin)->mask;
} else {
// not root.
Dir *dir = cur->open_dir();
assert(pdn != reply->get_trace_dn().end());
- cur = this->insert_inode(dir, *pin, *pdn);
+ cur = this->insert_inode(dir, *pin, *pdn, ttl);
dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl;
++pdn;
- // move to top of lru!
+ // touch dn
if (cur->dn) {
lru.lru_touch(cur->dn);
cur->dn->ttl = ttl;
}
}
- // set cache ttl
- if (g_conf.client_cache_stat_ttl) {
- cur->valid_until = now;
- cur->valid_until += g_conf.client_cache_stat_ttl;
- }
-
// update dir dist info
if (pdir == reply->get_trace_dir().end()) break;
update_dir_dist(cur, *pdir);
-
+/*
+ * bleh, dentry vs inode semantics here are sloppy
+ */
Dentry *Client::lookup(filepath& path)
{
dout(14) << "lookup " << path << dendl;
- Inode *cur = root;
+ Inode *cur;
+ if (path.get_ino()) {
+ if (inode_map.count(path.get_ino()))
+ cur = inode_map[path.get_ino()];
+ else
+ return NULL;
+ } else
+ cur = root;
if (!cur) return NULL;
Dentry *dn = 0;
Dir *dir = cur->dir;
if (dir->dentries.count(path[i])) {
dn = dir->dentries[path[i]];
- dout(14) << " hit dentry " << path[i] << " inode is " << dn->inode << " valid_until " << dn->inode->valid_until << dendl;
+ dout(14) << " hit dentry " << path[i] << " inode is " << dn->inode << " ttl " << dn->inode->ttl << dendl;
} else {
dout(14) << " dentry " << path[i] << " dne" << dendl;
return NULL;
return NULL; // not a dir
}
}
-
+
+ if (!dn)
+ dn = cur->dn;
if (dn) {
- dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< dendl;
+ dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " ttl " << dn->inode->ttl << dendl;
}
return dn;
// fuse assumes it's always there.
Inode *root;
filepath fpath("", 1);
- _do_lstat(fpath, STAT_MASK_ALL, &root);
+ _do_lstat(fpath, CEPH_STAT_MASK_INODE_ALL, &root);
_ll_get(root);
// trace?
{
Inode *in;
filepath fpath(path);
- int r = _do_lstat(fpath, STAT_MASK_BASE, &in);
+ int r = _do_lstat(fpath, CEPH_STAT_MASK_SYMLINK, &in);
if (r == 0 && !in->inode.is_symlink()) r = -EINVAL;
if (r == 0) {
// copy into buf (at most size bytes)
inode_t inode;
utime_t now = g_clock.real_now();
- if (dn &&
- now <= dn->inode->valid_until)
- dout(10) << "_lstat has inode " << fpath << " with mask " << dn->inode->mask << ", want " << mask << dendl;
+ if (dn) {
+ if (now <= dn->inode->ttl) {
+ dout(10) << "_lstat has inode " << fpath << " with mask " << dn->inode->mask << ", want " << mask << dendl;
+ } else {
+ dout(10) << "_lstat has EXPIRED (" << dn->inode->ttl << ") inode " << fpath
+ << " with mask " << dn->inode->mask << ", want " << mask
+ << dendl;
+ }
+ } else {
+ dout(10) << "_lstat has no dn for path " << fpath << dendl;
+ }
if (dn && dn->inode &&
- now <= dn->inode->valid_until &&
- ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) &&
+ now <= dn->inode->ttl &&
+ ((mask & ~CEPH_STAT_MASK_INODE) || now <= dn->inode->ttl) &&
((dn->inode->mask & mask) == mask)) {
inode = dn->inode->inode;
- dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << dendl;
+ dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->ttl << dendl;
- if (g_conf.client_cache_stat_ttl == 0)
- dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir
+ //if (g_conf.client_cache_stat_ttl == 0)
+ //dn->inode->ttl = utime_t(); // only one stat allowed after each readdir
*in = dn->inode;
} else {
- // FIXME where does FUSE maintain user information
- //struct fuse_context *fc = fuse_get_context();
- //req->set_caller_uid(fc->uid);
- //req->set_caller_gid(fc->gid);
-
req = new MClientRequest(CEPH_MDS_OP_LSTAT, messenger->get_myinst());
req->head.args.stat.mask = mask;
req->set_filepath(fpath);
{
Inode *in = 0;
filepath fpath(path);
- int res = _do_lstat(fpath, STAT_MASK_ALL, &in);
+ int res = _do_lstat(fpath, CEPH_STAT_MASK_INODE_ALL, &in);
if (res == 0) {
assert(in);
fill_stat(in, stbuf);
// only open dir if we're actually adding stuff to it!
Dir *dir = diri->open_dir();
assert(dir);
- utime_t now = g_clock.real_now();
+ utime_t ttl = g_clock.real_now();
+ ttl += 60.0;
list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
for (list<string>::const_iterator pdn = reply->get_dir_dn().begin();
res++;
// put in cache
- Inode *in = this->insert_inode(dir, *pin, *pdn);
-
- if (g_conf.client_cache_stat_ttl) {
- in->valid_until = now;
- in->valid_until += g_conf.client_cache_stat_ttl;
- }
- else if (g_conf.client_cache_readdir_ttl) {
- in->valid_until = now;
- in->valid_until += g_conf.client_cache_readdir_ttl;
- } else
- in->valid_until = utime_t();
+ Inode *in = this->insert_inode(dir, *pin, *pdn, ttl);
// contents to caller too!
dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << dendl;
{
Inode *in = 0;
filepath fpath("", f->inode->ino());
- int res = _do_lstat(fpath, STAT_MASK_ALL, &in);
+ int res = _do_lstat(fpath, CEPH_STAT_MASK_INODE_ALL, &in);
if (res == 0) {
assert(in);
fill_stat(in, stbuf);
Inode *in = _ll_get_inode(ino);
filepath fpath("", in->ino());
- int res = _do_lstat(fpath, STAT_MASK_ALL, &in);
+ int res = _do_lstat(fpath, CEPH_STAT_MASK_INODE_ALL, &in);
if (res == 0)
fill_stat(in, attr);
return res;
class Inode {
public:
inode_t inode; // the actual inode
- utime_t valid_until;
- int mask;
+ int mask;
+ utime_t ttl;
// about the dir (if this is one!)
int dir_auth;
Inode(inode_t _inode, ObjectCacher *_oc) :
inode(_inode),
- valid_until(0, 0),
+ mask(0),
dir_auth(-1), dir_hashed(false), dir_replicated(false),
file_wr_mtime(0, 0), file_wr_size(0),
num_open_rd(0), num_open_wr(0), num_open_lazy(0),
void unlock_fh_pos(Fh *f);
// metadata cache
- Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn);
+ Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn, utime_t ttl);
void update_dir_dist(Inode *in, DirStat *st);
Inode* insert_trace(MClientReply *reply);
while (res == 0) {
int r = client->readdirplus_r(dirp, &de, &st, &stmask);
if (r != 0) break;
- int stneed = STAT_MASK_INO | STAT_MASK_TYPE;
+ int stneed = CEPH_STAT_MASK_INODE | CEPH_STAT_MASK_TYPE;
res = filler(buf,
de.d_name,
((stmask & stneed) == stneed) ? &st:0,
#define CEPH_MDS_STATE_STOPPING 13 /* up, exporting metadata */
+/*
+ * metadata/stat validity masks
+ *
+ * Each low bit names a group of inode fields.  A mask is the OR of the
+ * groups being requested or held.
+ */
+#define CEPH_STAT_MASK_INODE 1 /* immutable inode bits */
+#define CEPH_STAT_MASK_AUTH 2 /* uid, gid, mode */
+#define CEPH_STAT_MASK_LINK 4 /* nlink */
+#define CEPH_STAT_MASK_FILE 8 /* mtime, size, atime */
+#define CEPH_STAT_MASK_INODE_ALL 15 /* INODE|AUTH|LINK|FILE */
+
+#define CEPH_STAT_MASK_DN 64 /* dentry */
+
+/* per-field names, each aliased to the group bit that covers that field */
+#define CEPH_STAT_MASK_TYPE CEPH_STAT_MASK_INODE /* mode >> 12 */
+#define CEPH_STAT_MASK_SYMLINK CEPH_STAT_MASK_INODE
+#define CEPH_STAT_MASK_LAYOUT CEPH_STAT_MASK_INODE
+#define CEPH_STAT_MASK_UID CEPH_STAT_MASK_AUTH
+#define CEPH_STAT_MASK_GID CEPH_STAT_MASK_AUTH
+#define CEPH_STAT_MASK_MODE CEPH_STAT_MASK_AUTH
+#define CEPH_STAT_MASK_NLINK CEPH_STAT_MASK_LINK
+#define CEPH_STAT_MASK_MTIME CEPH_STAT_MASK_FILE
+#define CEPH_STAT_MASK_SIZE CEPH_STAT_MASK_FILE
+#define CEPH_STAT_MASK_ATIME CEPH_STAT_MASK_FILE /* fixme */
+
+
/* client_session */
enum {
CEPH_SESSION_REQUEST_OPEN,
#define FILE_MODE_RW (1|2)
#define FILE_MODE_LAZY 4
-/** stat masks
- */
-#define STAT_MASK_INO 1 // inode nmber
-#define STAT_MASK_TYPE 2 // file type bits of the mode
-#define STAT_MASK_BASE 4 // layout, symlink value
-#define STAT_MASK_AUTH 8 // uid, gid, mode
-#define STAT_MASK_LINK 16 // nlink, anchored
-#define STAT_MASK_FILE 32 // mtime, size.
-
-#define STAT_MASK_ALL 63
-
-#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks
-#define STAT_MASK_MTIME STAT_MASK_FILE // mtime
-#define STAT_MASK_ATIME STAT_MASK_FILE // atime
-#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime
inline int DT_TO_MODE(int dt) {
return dt << 12;
assert(lock->get_state() == LOCK_SYNC);
if (lock->get_parent()->is_replicated() ||
- lock->get_num_clients()) {
+ lock->get_parent()->is_client_replicated()) {
// bcast to mds replicas
send_lock_message(lock, LOCK_AC_LOCK);
// bcast to client replicas
- for (hash_map<int, ClientReplica*>::iterator p = lock->client_set.begin();
- p != lock->client_set.end();
+ for (hash_map<int, ClientReplica*>::iterator p = lock->get_parent()->client_replica_map.begin();
+ p != lock->get_parent()->client_replica_map.end();
p++) {
ClientReplica *r = p->second;
if (lock->get_type() == LOCK_OTYPE_DN) {
__u32 numi = 0;
utime_t ttl = g_clock.now();
ttl += 60.0; // FIXME
+ ClientReplica *r;
while (true) {
// inode
- InodeStat::_encode(bl, in);
+ r = in->get_client_replica(client);
+ r->ttl = ttl;
+ r->mask |= InodeStat::_encode(bl, in);
numi++;
+
CDentry *dn = in->get_parent_dn();
if (!dn) break;
// dentry
::_encode_simple(dn->get_name(), bl);
- ClientReplica *r = dn->lock.client_set[client];
- if (!r)
- r = dn->lock.client_set[client] = new ClientReplica(client, &dn->lock);
+ r = dn->get_client_replica(client);
r->ttl = ttl;
+ r->mask = CEPH_STAT_MASK_DN;
session->replicas.push_back(&r->session_replica_item);
// dir
set<SimpleLock*> xlocks = mdr->xlocks;
int mask = req->head.args.stat.mask;
- if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock);
- if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock);
+ if (mask & CEPH_STAT_MASK_LINK) rdlocks.insert(&ref->linklock);
+ if (mask & CEPH_STAT_MASK_AUTH) rdlocks.insert(&ref->authlock);
if (ref->is_file() &&
- mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock);
+ mask & CEPH_STAT_MASK_FILE) rdlocks.insert(&ref->filelock);
if (ref->is_dir() &&
- mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock);
+ mask & CEPH_STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
int num_rdlock;
MDRequest *xlock_by;
-public:
- hash_map<int, ClientReplica*> client_set; // auth+rep
-
public:
SimpleLock(MDSCacheObject *o, int t, int wo) :
p != parent->replicas_end();
++p)
gather_set.insert(p->first);
- for (hash_map<int,ClientReplica*>::const_iterator p = client_set.begin();
- p != client_set.end();
+ for (hash_map<int,ClientReplica*>::const_iterator p = parent->client_replica_map.begin();
+ p != parent->client_replica_map.end();
p++)
gather_set.insert(-1 - p->second->client);
}
}
MDRequest *get_xlocked_by() { return xlock_by; }
- int get_num_clients() { return client_set.size(); }
-
bool is_used() {
- return is_xlocked() || is_rdlocked() || !client_set.empty();
+ return is_xlocked() || is_rdlocked() || !parent->client_replica_map.empty();
}
// encode/decode
out << get_lock_type_name(get_type()) << " ";
out << get_simplelock_state_name(get_state());
if (!get_gather_set().empty()) out << " g=" << get_gather_set();
- if (!client_set.empty())
- out << " c=" << client_set.size();
+ if (!parent->client_replica_map.empty())
+ out << " c=" << parent->client_replica_map.size();
if (is_rdlocked())
out << " r=" << get_num_rdlocks();
if (is_xlocked())
//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
+/*
+ * for metadata leases to clients
+ *
+ * One record describes what a single client holds on one cache object:
+ * which CEPH_STAT_MASK_* bits were handed out and the ttl that went
+ * with them.
+ */
+struct ClientReplica {
+  int client;                    // id of the client holding the lease
+  int mask;                      // CEPH_STAT_MASK_*
+  utime_t ttl;                   // ttl stamped by whoever issues the lease
+  MDSCacheObject *parent;        // cache object the lease is on; 0 if unattached
+  xlist<ClientReplica*>::item session_replica_item;  // hook for a session's replica list
+
+  // p defaults to 0 so existing one-argument callers keep compiling, and
+  // parent is always initialized instead of holding an indeterminate value.
+  ClientReplica(int c, MDSCacheObject *p = 0) :
+    client(c), mask(0),
+    parent(p),
+    session_replica_item(this) { }
+};
+
// print hack
struct mdsco_db_line_prefix {
// --------------------------------------------
- // replication
+ // replication (across mds cluster)
protected:
- map<int,int> replica_map; // [auth] mds -> nonce
+ map<int,int> replica_map; // [auth] mds -> nonce
int replica_nonce; // [replica] defined on replica
public:
void set_replica_nonce(int n) { replica_nonce = n; }
+ // ---------------------------------------------
+ // replicas (on clients)
+ public:
+ hash_map<int,ClientReplica*> client_replica_map;
+
+ // Return this object's lease record for client c, allocating and
+ // registering a fresh one the first time that client is seen.
+ ClientReplica *get_client_replica(int c) {
+   // single lookup instead of count() followed by operator[]
+   hash_map<int,ClientReplica*>::iterator p = client_replica_map.find(c);
+   if (p != client_replica_map.end())
+     return p->second;
+
+   ClientReplica *r = new ClientReplica(c);
+   r->parent = this;  // the one-argument constructor does not attach the record to its owner
+   client_replica_map[c] = r;
+   return r;
+ }
+ // True when at least one client has a replica record on this object.
+ bool is_client_replicated() {
+   return client_replica_map.size() > 0;
+ }
+
+
// ---------------------------------------------
// waiting
protected:
}
-struct ClientReplica {
- int client;
- SimpleLock *lock;
- utime_t ttl;
- xlist<ClientReplica*>::item session_replica_item;
- ClientReplica(int c, SimpleLock *l) : client(c), lock(l),
- session_replica_item(this) { }
-};
#endif
mask = e.mask;
}
- static void _encode(bufferlist &bl, CInode *in) {
- int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE;
-
+ static int _encode(bufferlist &bl, CInode *in) {
// mask
- if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH;
- if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK;
- if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE;
+ int mask = CEPH_STAT_MASK_INODE;
+ if (in->authlock.can_rdlock(0)) mask |= CEPH_STAT_MASK_AUTH;
+ if (in->linklock.can_rdlock(0)) mask |= CEPH_STAT_MASK_LINK;
+ if (in->filelock.can_rdlock(0)) mask |= CEPH_STAT_MASK_FILE;
/*
* note: encoding matches struct ceph_client_reply_inode
::_encode_simple(p->second, bl);
}
::_encode_simple(in->symlink, bl);
+
+ return mask;
}
};