From 13608f20c06770d870e6f33d51ce18a3a0d716aa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Jul 2008 16:15:59 -0700 Subject: [PATCH] mds: separate nest_info_t from frag_info_t, covered by a new CInode lock --- src/client/Client.cc | 7 +- src/client/Client.h | 2 +- src/include/ceph_fs.h | 5 +- src/include/utime.h | 4 + src/mds/CDir.cc | 33 ++--- src/mds/CDir.h | 2 + src/mds/CInode.cc | 126 ++++++++++++++++-- src/mds/CInode.h | 17 ++- src/mds/Locker.cc | 17 ++- src/mds/LogSegment.h | 1 + src/mds/MDCache.cc | 235 ++++++++++++++++++++------------- src/mds/MDCache.h | 5 +- src/mds/Server.cc | 70 ++++++---- src/mds/SimpleLock.h | 1 + src/mds/journal.cc | 6 + src/mds/mdstypes.h | 108 +++++++++++---- src/messages/MClientReply.h | 10 +- src/messages/MMDSCacheRejoin.h | 16 ++- 18 files changed, 471 insertions(+), 194 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 7ccfdda97d1e5..fe86aa642a069 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -388,6 +388,7 @@ void Client::update_inode(Inode *in, InodeStat *st, LeaseStat *lease, utime_t fr in->dirfragtree = st->dirfragtree; // FIXME look at the mask! in->xattrs.swap(st->xattrs); in->inode.dirstat = st->dirstat; + in->inode.rstat = st->rstat; in->inode.ctime = st->ctime; in->inode.max_size = st->max_size; // right? 
@@ -2500,7 +2501,7 @@ int Client::_do_lstat(const filepath &path, int mask, Inode **in, int uid, int g } -int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat) +int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat) { dout(10) << "fill_stat on " << in->inode.ino << " snap/dev" << in->snapid << " mode 0" << oct << in->inode.mode << dec @@ -2518,7 +2519,7 @@ int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat) st->st_mtime = in->inode.mtime; if (in->inode.is_dir()) { //st->st_size = in->inode.dirstat.size(); - st->st_size = in->inode.dirstat.rbytes; + st->st_size = in->inode.rstat.rbytes; st->st_blocks = 1; } else { st->st_size = in->inode.size; @@ -2528,6 +2529,8 @@ int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat) if (dirstat) *dirstat = in->inode.dirstat; + if (rstat) + *rstat = in->inode.rstat; return in->lease_mask; } diff --git a/src/client/Client.h b/src/client/Client.h index 827da2a166196..cafe29e899617 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -788,7 +788,7 @@ protected: // find dentry based on filepath Dentry *lookup(const filepath& path, snapid_t snap=CEPH_NOSNAP); - int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0); + int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0); // trace generation diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index fc5afaf6709c8..a68e211b5e0ce 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -482,8 +482,9 @@ struct ceph_mds_getmap { #define CEPH_LOCK_ILINK 16 #define CEPH_LOCK_IDFT 32 /* dir frag tree */ #define CEPH_LOCK_IDIR 64 /* mds internal */ -#define CEPH_LOCK_IXATTR 128 -#define CEPH_LOCK_ISNAP 256 +#define CEPH_LOCK_INEST 128 /* mds internal */ +#define CEPH_LOCK_IXATTR 256 +#define CEPH_LOCK_ISNAP 512 #define CEPH_LOCK_INO 2048 /* immutable inode bits; not actually a lock */ #define CEPH_LOCK_ICONTENT 
(CEPH_LOCK_IFILE|CEPH_LOCK_IDIR) /* alias for either filelock or dirlock */ diff --git a/src/include/utime.h b/src/include/utime.h index 799b7240b16e2..18243328bedb8 100644 --- a/src/include/utime.h +++ b/src/include/utime.h @@ -152,6 +152,10 @@ inline bool operator<(const utime_t& a, const utime_t& b) { return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); } +inline bool operator==(const utime_t& a, const utime_t& b) +{ + return a.sec() == b.sec() && a.usec() == b.usec(); +} // ostream inline std::ostream& operator<<(std::ostream& out, const utime_t& t) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index c6fde12b632f7..01a3a22544337 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -99,9 +99,9 @@ ostream& operator<<(ostream& out, CDir& dir) out << " s=" << dir.fnode.fragstat.size() << "=" << dir.fnode.fragstat.nfiles << "+" << dir.fnode.fragstat.nsubdirs; - out << " rb=" << dir.fnode.fragstat.rbytes << "/" << dir.fnode.accounted_fragstat.rbytes; - out << " rf=" << dir.fnode.fragstat.rfiles << "/" << dir.fnode.accounted_fragstat.rfiles; - out << " rd=" << dir.fnode.fragstat.rsubdirs << "/" << dir.fnode.accounted_fragstat.rsubdirs; + out << " rb=" << dir.fnode.rstat.rbytes << "/" << dir.fnode.accounted_rstat.rbytes; + out << " rf=" << dir.fnode.rstat.rfiles << "/" << dir.fnode.accounted_rstat.rfiles; + out << " rd=" << dir.fnode.rstat.rsubdirs << "/" << dir.fnode.accounted_rstat.rsubdirs; out << " hs=" << dir.get_num_head_items() << "+" << dir.get_num_head_null(); out << ",ss=" << dir.get_num_snap_items() << "+" << dir.get_num_snap_null(); @@ -556,11 +556,13 @@ void CDir::steal_dentry(CDentry *dn) fnode.fragstat.nsubdirs++; else fnode.fragstat.nfiles++; - fnode.fragstat.rbytes += pi->accounted_dirstat.rbytes; - fnode.fragstat.rfiles += pi->accounted_dirstat.rfiles; - fnode.fragstat.rsubdirs += pi->accounted_dirstat.rsubdirs; - if (pi->accounted_dirstat.rctime > fnode.fragstat.rctime) - fnode.fragstat.rctime = pi->accounted_dirstat.rctime; + 
fnode.rstat.rbytes += pi->accounted_rstat.rbytes; + fnode.rstat.rfiles += pi->accounted_rstat.rfiles; + fnode.rstat.rsubdirs += pi->accounted_rstat.rsubdirs; + fnode.rstat.ranchors += pi->accounted_rstat.ranchors; + fnode.rstat.rsnaprealms += pi->accounted_rstat.rsnaprealms; /* each nested counter accumulates its own accounted value */ + if (pi->accounted_rstat.rctime > fnode.rstat.rctime) + fnode.rstat.rctime = pi->accounted_rstat.rctime; } else if (dn->is_remote()) { if (dn->get_remote_d_type() == (S_IFDIR >> 12)) fnode.fragstat.nsubdirs++; @@ -625,13 +627,12 @@ void CDir::split(int bits, list& subs, list& waiters, bool repl double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - frag_info_t olddiff; // old += f - af; - bool changed_mtime; - dout(10) << " fragstat " << fnode.fragstat << dendl; - dout(10) << " accounted_fragstat " << fnode.accounted_fragstat << dendl; + nest_info_t olddiff; // old += f - af; + dout(10) << " rstat " << fnode.rstat << dendl; + dout(10) << " accounted_rstat " << fnode.accounted_rstat << dendl; olddiff.zero(); - olddiff.take_diff(fnode.fragstat, fnode.accounted_fragstat, changed_mtime); - dout(10) << " olddiff " << olddiff << dendl; + olddiff.take_diff(fnode.rstat, fnode.accounted_rstat); + dout(10) << " olddiff " << olddiff << dendl; // create subfrag dirs int n = 0; @@ -682,9 +683,9 @@ void CDir::split(int bits, list& subs, list& waiters, bool repl // give any outstanding frag stat differential to first frag // af[0] -= olddiff dout(10) << "giving olddiff " << olddiff << " to " << *subfrags[0] << dendl; - frag_info_t zero; + nest_info_t zero; zero.zero(); - subfrags[0]->fnode.accounted_fragstat.take_diff(zero, olddiff, changed_mtime); + subfrags[0]->fnode.accounted_rstat.take_diff(zero, olddiff); dout(10) << " " << subfrags[0]->fnode.accounted_fragstat << dendl; purge_stolen(waiters, replay); diff --git a/src/mds/CDir.h b/src/mds/CDir.h index fd2e98e0a9374..7ce6d31c4591d 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -157,6 +157,8 @@ class CDir : public MDSCacheObject { } fnode_t 
fnode; + snapid_t first; + map dirty_old_fnodes; protected: version_t projected_version; diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 58fcb7f1a7c45..322f63548506c 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -103,12 +103,12 @@ ostream& operator<<(ostream& out, CInode& in) out << " nl=" << in.inode.nlink; } - out << " rb=" << in.inode.dirstat.rbytes; - if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rbytes; - out << " rf=" << in.inode.dirstat.rfiles; - if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rfiles; - out << " rd=" << in.inode.dirstat.rsubdirs; - if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rsubdirs; + out << " rb=" << in.inode.rstat.rbytes; + if (in.is_projected()) out << "/" << in.inode.accounted_rstat.rbytes; + out << " rf=" << in.inode.rstat.rfiles; + if (in.is_projected()) out << "/" << in.inode.accounted_rstat.rfiles; + out << " rd=" << in.inode.rstat.rsubdirs; + if (in.is_projected()) out << "/" << in.inode.accounted_rstat.rsubdirs; // locks out << " " << in.authlock; @@ -117,6 +117,7 @@ ostream& operator<<(ostream& out, CInode& in) out << " " << in.dirfragtreelock; out << " " << in.dirlock; out << " " << in.snaplock; + out << " " << in.nestlock; } else out << " " << in.filelock; out << " " << in.xattrlock; @@ -596,6 +597,30 @@ void CInode::encode_lock_state(int type, bufferlist& bl) } break; + case CEPH_LOCK_INEST: + { + dout(15) << "encode_lock_state inode.rstat is " << inode.rstat << dendl; + ::encode(inode.rstat, bl); // only meaningful if i am auth. 
+ bufferlist tmp; + __u32 n = 0; + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + if (is_auth() || p->second->is_auth()) { + dout(15) << "encode_lock_state rstat for " << *p->second << dendl; + dout(20) << " rstat " << p->second->fnode.rstat << dendl; + dout(20) << " accounted_rstat " << p->second->fnode.accounted_rstat << dendl; + frag_t fg = p->second->dirfrag().frag; + ::encode(fg, tmp); + ::encode(p->second->fnode.rstat, tmp); + ::encode(p->second->fnode.accounted_rstat, tmp); + n++; + } + ::encode(n, bl); + bl.claim_append(tmp); + } + break; + case CEPH_LOCK_IXATTR: ::encode(xattrs, bl); break; @@ -603,6 +628,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl) case CEPH_LOCK_ISNAP: encode_snap(bl); break; + default: assert(0); @@ -720,6 +746,52 @@ void CInode::decode_lock_state(int type, bufferlist& bl) } break; + case CEPH_LOCK_INEST: + { + nest_info_t rstat; + ::decode(rstat, p); + if (!is_auth()) { + dout(10) << " taking inode rstat " << rstat << " for " << *this << dendl; + inode.rstat = rstat; // take inode summation if replica + } + __u32 n; + ::decode(n, p); + dout(10) << " ...got " << n << " rstats on " << *this << dendl; + while (n--) { + frag_t fg; + nest_info_t rstat; + nest_info_t accounted_rstat; + ::decode(fg, p); + ::decode(rstat, p); + ::decode(accounted_rstat, p); + dout(10) << fg << " got changed rstat " << rstat << dendl; + dout(20) << fg << " accounted_rstat " << accounted_rstat << dendl; + + CDir *dir = get_dirfrag(fg); + if (is_auth()) { + assert(dir); // i am auth; i had better have this dir open + dout(10) << " " << fg << " rstat " << rstat << " on " << *dir << dendl; + dout(20) << " " << fg << " accounted_rstat " << accounted_rstat << dendl; + dir->fnode.rstat = rstat; + dir->fnode.accounted_rstat = accounted_rstat; + if (!(rstat == accounted_rstat)) + nestlock.set_updated(); /* rstat is covered by nestlock, not dirlock */ + } else { + if (dir && + dir->is_auth() && + !(dir->fnode.accounted_rstat == rstat)) { + dout(10) << " setting accounted_rstat 
" << rstat << " and setting dirty bit on " + << *dir << dendl; + fnode_t *pf = dir->get_projected_fnode(); + pf->accounted_rstat = rstat; + if (dir->is_auth()) + dir->_set_dirty_flag(); // bit of a hack + } + } + } + } + break; + case CEPH_LOCK_IXATTR: ::decode(xattrs, p); break; @@ -741,6 +813,10 @@ void CInode::clear_dirty_scattered(int type) xlist_dirty_dirfrag_dir.remove_myself(); break; + case CEPH_LOCK_INEST: + xlist_dirty_dirfrag_nest.remove_myself(); + break; + case CEPH_LOCK_IDFT: xlist_dirty_dirfrag_dirfragtree.remove_myself(); break; @@ -789,6 +865,34 @@ void CInode::finish_scatter_gather_update(int type) } break; + case CEPH_LOCK_INEST: + { + // adjust summation + assert(is_auth()); + inode_t *pi = get_projected_inode(); + dout(20) << " orig rstat " << pi->rstat << dendl; + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + p++) { + fnode_t *pf = p->second->get_projected_fnode(); + if (pf->accounted_rstat.version == pi->rstat.version) { + dout(20) << " frag " << p->first << " " << *p->second << dendl; + dout(20) << " rstat " << pf->rstat << dendl; + dout(20) << " accounted_rstat " << pf->accounted_rstat << dendl; + pi->rstat.take_diff(pf->rstat, + pf->accounted_rstat); + } else { + dout(20) << " frag " << p->first << " on " << *p->second << dendl; + dout(20) << " ignoring OLD accounted_rstat " << pf->accounted_rstat << dendl; + } + } + pi->rstat.version++; + dout(20) << " final rstat " << pi->rstat << dendl; + assert(pi->rstat.rfiles >= 0); + assert(pi->rstat.rsubdirs >= 0); + } + break; + case CEPH_LOCK_IDFT: break; @@ -1081,10 +1185,10 @@ void CInode::encode_inodestat(bufferlist& bl, snapid_t snapid) e.files = i->dirstat.nfiles; e.subdirs = i->dirstat.nsubdirs; - i->dirstat.rctime.encode_timeval(&e.rctime); - e.rbytes = i->dirstat.rbytes; - e.rfiles = i->dirstat.rfiles; - e.rsubdirs = i->dirstat.rsubdirs; + i->rstat.rctime.encode_timeval(&e.rctime); + e.rbytes = i->rstat.rbytes; + e.rfiles = i->rstat.rfiles; + e.rsubdirs = i->rstat.rsubdirs; e.rdev = 
i->rdev; e.fragtree.nsplits = dirfragtree._splits.size(); @@ -1129,6 +1233,7 @@ void CInode::encode_export(bufferlist& bl) ::encode(dirlock, bl); ::encode(xattrlock, bl); ::encode(snaplock, bl); + ::encode(nestlock, bl); get(PIN_TEMPEXPORTING); } @@ -1173,4 +1278,5 @@ void CInode::decode_import(bufferlist::iterator& p, ::decode(dirlock, p); ::decode(xattrlock, p); ::decode(snaplock, p); + ::decode(nestlock, p); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 062207b954489..57b6172443a33 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -118,6 +118,7 @@ class CInode : public MDSCacheObject { static const int WAIT_VERSIONLOCK_OFFSET = 4 + 4*SimpleLock::WAIT_BITS; static const int WAIT_XATTRLOCK_OFFSET = 4 + 5*SimpleLock::WAIT_BITS; static const int WAIT_SNAPLOCK_OFFSET = 4 + 6*SimpleLock::WAIT_BITS; + static const int WAIT_NESTLOCK_OFFSET = 4 + 7*SimpleLock::WAIT_BITS; static const int WAIT_ANY_MASK = (0xffffffff); @@ -137,8 +138,9 @@ class CInode : public MDSCacheObject { SnapRealm *snaprealm; SnapRealm *containing_realm; - snapid_t first, last; // last=0 => multiversion or head. 
+ snapid_t first, last; map old_inodes; // key = last, value.first = first + set dirty_old_dirstats; bool is_multiversion() { return snaprealm || inode.is_dir(); } snapid_t get_oldest_snap(); @@ -233,6 +235,7 @@ public: xlist::item xlist_caps; xlist::item xlist_open_file; xlist::item xlist_dirty_dirfrag_dir; + xlist::item xlist_dirty_dirfrag_nest; xlist::item xlist_dirty_dirfrag_dirfragtree; xlist::item xlist_purging_inode; @@ -275,6 +278,7 @@ private: replica_caps_wanted(0), xlist_dirty(this), xlist_caps(this), xlist_open_file(this), xlist_dirty_dirfrag_dir(this), + xlist_dirty_dirfrag_nest(this), xlist_dirty_dirfrag_dirfragtree(this), xlist_purging_inode(this), auth_pins(0), nested_auth_pins(0), @@ -286,7 +290,8 @@ private: filelock(this, CEPH_LOCK_IFILE, WAIT_FILELOCK_OFFSET), dirlock(this, CEPH_LOCK_IDIR, WAIT_DIRLOCK_OFFSET), xattrlock(this, CEPH_LOCK_IXATTR, WAIT_XATTRLOCK_OFFSET), - snaplock(this, CEPH_LOCK_ISNAP, WAIT_SNAPLOCK_OFFSET) + snaplock(this, CEPH_LOCK_ISNAP, WAIT_SNAPLOCK_OFFSET), + nestlock(this, CEPH_LOCK_INEST, WAIT_NESTLOCK_OFFSET) { memset(&inode, 0, sizeof(inode)); state = 0; @@ -382,6 +387,7 @@ public: ScatterLock dirlock; SimpleLock xattrlock; SimpleLock snaplock; + ScatterLock nestlock; SimpleLock* get_lock(int type) { switch (type) { @@ -392,6 +398,7 @@ public: case CEPH_LOCK_IDIR: return &dirlock; case CEPH_LOCK_IXATTR: return &xattrlock; case CEPH_LOCK_ISNAP: return &snaplock; + case CEPH_LOCK_INEST: return &nestlock; } return 0; } @@ -567,6 +574,7 @@ public: dirlock.replicate_relax(); xattrlock.replicate_relax(); snaplock.replicate_relax(); + nestlock.replicate_relax(); } @@ -680,6 +688,7 @@ class CInodeDiscover { __u32 dirlock_state; __u32 xattrlock_state; __u32 snaplock_state; + __u32 nestlock_state; public: CInodeDiscover() {} @@ -698,6 +707,7 @@ class CInodeDiscover { dirlock_state = in->dirlock.get_replica_state(); xattrlock_state = in->xattrlock.get_replica_state(); snaplock_state = in->snaplock.get_replica_state(); + 
nestlock_state = in->nestlock.get_replica_state(); } CInodeDiscover(bufferlist::iterator &p) { decode(p); @@ -723,6 +733,7 @@ class CInodeDiscover { in->dirlock.set_state(dirlock_state); in->xattrlock.set_state(xattrlock_state); in->snaplock.set_state(snaplock_state); + in->nestlock.set_state(nestlock_state); } void encode(bufferlist &bl) const { @@ -738,6 +749,7 @@ class CInodeDiscover { ::encode(dirlock_state, bl); ::encode(xattrlock_state, bl); ::encode(snaplock_state, bl); + ::encode(nestlock_state, bl); } void decode(bufferlist::iterator &p) { @@ -753,6 +765,7 @@ class CInodeDiscover { ::decode(dirlock_state, p); ::decode(xattrlock_state, p); ::decode(snaplock_state, p); + ::decode(nestlock_state, p); } }; diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 911b1f11a791e..900772aed6d90 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -348,6 +348,7 @@ void Locker::eval_gather(SimpleLock *lock) return file_eval_gather((FileLock*)lock); case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_eval_gather((ScatterLock*)lock); default: return simple_eval_gather(lock); @@ -361,6 +362,7 @@ bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mut) return file_rdlock_start((FileLock*)lock, mut); case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_rdlock_start((ScatterLock*)lock, mut); default: return simple_rdlock_start(lock, mut); @@ -374,6 +376,7 @@ void Locker::rdlock_finish(SimpleLock *lock, Mutation *mut) return file_rdlock_finish((FileLock*)lock, mut); case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_rdlock_finish((ScatterLock*)lock, mut); default: return simple_rdlock_finish(lock, mut); @@ -385,6 +388,7 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut) switch (lock->get_type()) { case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_wrlock_start((ScatterLock*)lock, mut); case CEPH_LOCK_IVERSION: return 
local_wrlock_start((LocalLock*)lock, mut); @@ -401,6 +405,7 @@ void Locker::wrlock_finish(SimpleLock *lock, Mutation *mut) switch (lock->get_type()) { case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_wrlock_finish((ScatterLock*)lock, mut); case CEPH_LOCK_IVERSION: return local_wrlock_finish((LocalLock*)lock, mut); @@ -420,6 +425,7 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) return local_xlock_start((LocalLock*)lock, mut); case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_xlock_start((ScatterLock*)lock, mut); default: return simple_xlock_start(lock, mut); @@ -435,6 +441,7 @@ void Locker::xlock_finish(SimpleLock *lock, Mutation *mut) return local_xlock_finish((LocalLock*)lock, mut); case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: return scatter_xlock_finish((ScatterLock*)lock, mut); default: return simple_xlock_finish(lock, mut); @@ -878,7 +885,7 @@ bool Locker::check_inode_max_size(CInode *in, bool forceupdate, __u64 new_size) dout(10) << "check_inode_max_size also forcing size " << pi->size << " -> " << new_size << dendl; pi->size = new_size; - pi->dirstat.rbytes = new_size; + pi->rstat.rbytes = new_size; } EOpen *le = new EOpen(mds->mdlog); @@ -1118,7 +1125,7 @@ void Locker::_do_cap_update(CInode *in, int had, int all_wanted, snapid_t follow dout(7) << " size " << pi->size << " -> " << size << " for " << *in << dendl; pi->size = size; - pi->dirstat.rbytes = size; + pi->rstat.rbytes = size; } if (dirty_atime) { dout(7) << " atime " << pi->atime << " -> " << atime @@ -1134,8 +1141,7 @@ void Locker::_do_cap_update(CInode *in, int had, int all_wanted, snapid_t follow mut->ls = mds->mdlog->get_current_segment(); file_wrlock_force(&in->filelock, mut); // wrlock for duration of journal mut->auth_pin(in); - mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); - + mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 
0, follows); mdcache->journal_dirty_inode(&le->metablob, in, follows); mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, change_max)); @@ -1336,6 +1342,7 @@ SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) case CEPH_LOCK_IDFT: case CEPH_LOCK_IFILE: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: case CEPH_LOCK_IXATTR: case CEPH_LOCK_ISNAP: { @@ -1350,6 +1357,7 @@ SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) case CEPH_LOCK_IDFT: return &in->dirfragtreelock; case CEPH_LOCK_IFILE: return &in->filelock; case CEPH_LOCK_IDIR: return &in->dirlock; + case CEPH_LOCK_INEST: return &in->nestlock; case CEPH_LOCK_IXATTR: return &in->xattrlock; case CEPH_LOCK_ISNAP: return &in->snaplock; } @@ -1390,6 +1398,7 @@ void Locker::handle_lock(MLock *m) case CEPH_LOCK_IDFT: case CEPH_LOCK_IDIR: + case CEPH_LOCK_INEST: handle_scatter_lock((ScatterLock*)lock, m); break; diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index fa117786af9c3..80e0af1dda17b 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -42,6 +42,7 @@ class LogSegment { xlist open_files; xlist dirty_dirfrag_dir; + xlist dirty_dirfrag_nest; xlist dirty_dirfrag_dirfragtree; xlist slave_updates; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 7a392df19be75..4a95d0b673da2 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -532,10 +532,13 @@ void MDCache::eval_subtree_root(CDir *dir) if (dir->inode->is_auth() && dir->inode->dirlock.is_stable()) { // force the issue a bit - if (!dir->inode->is_frozen()) + if (!dir->inode->is_frozen()) { mds->locker->scatter_eval(&dir->inode->dirlock); - else + mds->locker->scatter_eval(&dir->inode->nestlock); + } else { mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** + mds->locker->try_scatter_eval(&dir->inode->nestlock); // ** may or may not be auth_pinned ** + } } } @@ -933,7 +936,7 @@ int MDCache::num_subtrees_fullnonauth() // 
=================================== -// journal helpers +// journal and snap/cow helpers /* * find first inode in cache that follows given snapid. otherwise, return current. @@ -1011,14 +1014,9 @@ void MDCache::journal_cow_dentry(EMetaBlob *metablob, CDentry *dn, snapid_t foll { dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl; - // nothing to cow on a null dentry + // nothing to cow on a null dentry, fix caller assert(!dn->is_null()); - /* - * normally, we write to the head, and make a clone of ther previous - * dentry+inode state. unless the follow snapid specified. - */ - if (dn->is_primary() && dn->inode->is_multiversion()) { // multiversion inode. CInode *in = dn->inode; @@ -1034,6 +1032,9 @@ void MDCache::journal_cow_dentry(EMetaBlob *metablob, CDentry *dn, snapid_t foll old.first = in->first; old.inode = *in->get_previous_projected_inode(); old.xattrs = in->xattrs; + + //if (!(old.inode.dirstat == old.inode.accounted_dirstat)) + //in->dirty_old_dirstats.insert(follows); in->first = follows+1; @@ -1106,7 +1107,8 @@ inode_t *MDCache::journal_dirty_inode(EMetaBlob *metablob, CInode *in, snapid_t */ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, CInode *in, CDir *parent, - int flags, int linkunlink) + int flags, int linkunlink, + snapid_t cfollows) { bool primary_dn = flags & PREDIRTY_PRIMARY; bool do_parent_mtime = flags & PREDIRTY_DIR; @@ -1121,6 +1123,7 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, << " linkunlink=" << linkunlink << (primary_dn ? " primary_dn":" remote_dn") << (shallow ? 
" SHALLOW":"") + << " follows " << cfollows << " " << *in << dendl; if (!parent) { @@ -1136,9 +1139,6 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, inode_t *curi = in->get_projected_inode(); - __s64 drbytes = 1, drfiles = 0, drsubdirs = 0, dranchors = 0, drsnaprealms = 0; - utime_t rctime; - // build list of inodes to wrlock, dirty, and update list lsi; CInode *cur = in; @@ -1149,11 +1149,6 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, // opportunistically adjust parent dirfrag CInode *pin = parent->get_inode(); - if (do_parent_mtime || linkunlink) { - assert(mut->wrlocks.count(&pin->dirlock) || - mut->is_slave()); // we are slave. master will have wrlocked the dir. - } - // inode -> dirfrag mut->auth_pin(parent); mut->add_projected_fnode(parent); @@ -1161,60 +1156,75 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, fnode_t *pf = parent->project_fnode(); pf->version = parent->pre_dirty(); - if (do_parent_mtime) { - pf->fragstat.mtime = mut->now; - if (mut->now > pf->fragstat.rctime) { - dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; - pf->fragstat.rctime = mut->now; - } else { - dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; + if (do_parent_mtime || linkunlink) { + assert(mut->wrlocks.count(&pin->dirlock) || + mut->is_slave()); // we are slave. master will have wrlocked the dir. 
+ + if (do_parent_mtime) { + pf->fragstat.mtime = mut->now; + if (mut->now > pf->rstat.rctime) { + dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; + pf->rstat.rctime = mut->now; + } else { + dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; + } + } + if (linkunlink) { + dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; + if (in->is_dir()) { + pf->fragstat.nsubdirs += linkunlink; + pf->rstat.rsubdirs += linkunlink; + } else { + pf->fragstat.nfiles += linkunlink; + pf->rstat.rfiles += linkunlink; + } } } - if (linkunlink) { - dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; - if (in->is_dir()) - pf->fragstat.nsubdirs += linkunlink; - else - pf->fragstat.nfiles += linkunlink; + + + /* + if (follows == CEPH_NOSNAP || follows == 0) + follows = parent->inode->find_snaprealm()->get_latest_snap(); + + // cow fnode? + snapid_t follows = cfollows; + if (follows >= first && + !(pf->fragstat == pf->accounted_fragstat)) { + dout(10) << " cow fnode, follows " << follows << dendl; + dirty_old_fnodes[follows] = parent->get_projected_fnode(); } - if (primary_dn) { + first = follows+1; + */ + // which fnode to write to? 
+ //fnode_t *pf = 0; + /* fixme + if (dirty_old_fnodes.size() && + dirty_old_fnodes.rbegin()->first > follows) { + map::iterator p = dirty_old_fnodes.upper_bound(follows); + dout(10) << " cloning dirty_old_fnode " << p->first << " to follows " << follows << dendl; + dirty_old_fnodes[follows] = p->second; + pf = &p->fragstat; + } + } + */ + //if (!pf) { + + if (primary_dn) { + nest_info_t delta; + delta.zero(); if (linkunlink == 0) { - drbytes = curi->dirstat.rbytes - curi->accounted_dirstat.rbytes; - drfiles = curi->dirstat.rfiles - curi->accounted_dirstat.rfiles; - drsubdirs = curi->dirstat.rsubdirs - curi->accounted_dirstat.rsubdirs; - dranchors = curi->dirstat.ranchors - curi->accounted_dirstat.ranchors; - drsnaprealms = curi->dirstat.rsnaprealms - curi->accounted_dirstat.rsnaprealms; + delta.add(curi->rstat); + delta.sub(curi->accounted_rstat); } else if (linkunlink < 0) { - drbytes = 0 - curi->accounted_dirstat.rbytes; - drfiles = 0 - curi->accounted_dirstat.rfiles; - drsubdirs = 0 - curi->accounted_dirstat.rsubdirs; - dranchors = 0 - curi->accounted_dirstat.ranchors; - drsnaprealms = 0 - curi->accounted_dirstat.rsnaprealms; + delta.sub(curi->accounted_rstat); } else { - drbytes = curi->dirstat.rbytes; - drfiles = curi->dirstat.rfiles; - drsubdirs = curi->dirstat.rsubdirs; - dranchors = curi->dirstat.ranchors; - drsnaprealms = curi->dirstat.rsnaprealms; + delta.add(curi->rstat); } - rctime = MAX(curi->ctime, curi->dirstat.rctime); - - dout(10) << "predirty_journal_parents delta " - << drbytes << " bytes / " << drfiles << " files / " << drsubdirs << " subdirs for " - << *parent << dendl; - pf->fragstat.rbytes += drbytes; - pf->fragstat.rfiles += drfiles; - pf->fragstat.rsubdirs += drsubdirs; - pf->fragstat.ranchors += dranchors; - pf->fragstat.rsnaprealms += drsnaprealms; - pf->fragstat.rctime = rctime; - - curi->accounted_dirstat = curi->dirstat; - } else { - dout(10) << "predirty_journal_parents no delta (remote dentry, or rename within same dir) in " << 
*parent << dendl; - pf->fragstat.rfiles += linkunlink; - } + dout(10) << "predirty_journal_parents delta " << delta << " " << *parent << dendl; + pf->rstat.add(delta); + curi->accounted_rstat = curi->rstat; + } // stop? if (pin->is_base()) @@ -1226,23 +1236,26 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, stop = true; } if (!stop && - mut->wrlocks.count(&pin->dirlock) == 0 && + mut->wrlocks.count(&pin->nestlock) == 0 && (!pin->can_auth_pin() || !pin->versionlock.can_wrlock() || // make sure we can take versionlock, too - !mds->locker->scatter_wrlock_try(&pin->dirlock, mut, false))) { // ** do not initiate.. see above comment ** - dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->dirlock + !mds->locker->scatter_wrlock_try(&pin->nestlock, mut, false))) { // ** do not initiate.. see above comment ** + dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock << " on " << *pin << dendl; stop = true; } if (stop) { - dout(10) << "predirty_journal_parents stop. marking dirlock on " << *pin << dendl; - mds->locker->mark_updated_scatterlock(&pin->dirlock); - mut->ls->dirty_dirfrag_dir.push_back(&pin->xlist_dirty_dirfrag_dir); - mut->add_updated_scatterlock(&pin->dirlock); + dout(10) << "predirty_journal_parents stop. 
marking nestlock on " << *pin << dendl; + mds->locker->mark_updated_scatterlock(&pin->nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&pin->xlist_dirty_dirfrag_nest); + mut->add_updated_scatterlock(&pin->nestlock); break; } mds->locker->local_wrlock_grab(&pin->versionlock, mut); + assert(mut->wrlocks.count(&pin->nestlock) || + mut->is_slave()); + // dirfrag -> diri mut->auth_pin(pin); mut->add_projected_inode(pin); @@ -1250,14 +1263,27 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, inode_t *pi = pin->project_inode(); pi->version = pin->pre_dirty(); - pi->dirstat.version++; - dout(15) << "predirty_journal_parents take_diff " << pf->fragstat << dendl; - dout(15) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; - bool touched_mtime = false; - pi->dirstat.take_diff(pf->fragstat, pf->accounted_fragstat, touched_mtime); - if (touched_mtime) - pi->mtime = pi->ctime = pi->dirstat.mtime; - dout(15) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl; + + // dirstat + if (do_parent_mtime || linkunlink) { + pi->dirstat.version++; + dout(15) << "predirty_journal_parents take_diff " << pf->fragstat << dendl; + dout(15) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; + bool touched_mtime = false; + pi->dirstat.take_diff(pf->fragstat, pf->accounted_fragstat, touched_mtime); + if (touched_mtime) + pi->mtime = pi->ctime = pi->dirstat.mtime; + dout(15) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl; + } + + // rstat + if (primary_dn) { + pi->rstat.version++; + dout(15) << "predirty_journal_parents take_diff " << pf->rstat << dendl; + dout(15) << "predirty_journal_parents - " << pf->accounted_rstat << dendl; + pi->rstat.take_diff(pf->rstat, pf->accounted_rstat); + dout(15) << "predirty_journal_parents gives " << pi->rstat << " on " << *pin << dendl; + } // next parent! 
cur = pin; @@ -2186,7 +2212,10 @@ void MDCache::rejoin_send_rejoins() root->linklock.get_state(), root->dirfragtreelock.get_state(), root->filelock.get_state(), - root->dirlock.get_state()); + root->dirlock.get_state(), + root->nestlock.get_state(), + root->snaplock.get_state(), + root->xattrlock.get_state()); } if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { p->second->add_weak_inode(in->ino()); @@ -2196,7 +2225,10 @@ void MDCache::rejoin_send_rejoins() in->linklock.get_state(), in->dirfragtreelock.get_state(), in->filelock.get_state(), - in->dirlock.get_state()); + in->dirlock.get_state(), + in->nestlock.get_state(), + in->snaplock.get_state(), + in->xattrlock.get_state()); } } } @@ -2326,7 +2358,10 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) in->linklock.get_state(), in->dirfragtreelock.get_state(), in->filelock.get_state(), - in->dirlock.get_state()); + in->dirlock.get_state(), + in->nestlock.get_state(), + in->snaplock.get_state(), + in->xattrlock.get_state()); in->get_nested_dirfrags(nested); } } @@ -2519,7 +2554,10 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); + in->dirlock.get_replica_state(), + in->nestlock.get_replica_state(), + in->snaplock.get_replica_state(), + in->xattrlock.get_replica_state()); } } } @@ -2543,7 +2581,10 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); + in->dirlock.get_replica_state(), + in->nestlock.get_replica_state(), + in->snaplock.get_replica_state(), + in->xattrlock.get_replica_state()); } if (survivor) { @@ -3223,7 +3264,10 @@ void MDCache::rejoin_send_acks() in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), in->filelock.get_replica_state(), - 
in->dirlock.get_replica_state()); + in->dirlock.get_replica_state(), + in->nestlock.get_replica_state(), + in->snaplock.get_replica_state(), + in->xattrlock.get_replica_state()); } // subdirs in this subtree? @@ -3243,7 +3287,10 @@ void MDCache::rejoin_send_acks() root->linklock.get_replica_state(), root->dirfragtreelock.get_replica_state(), root->filelock.get_replica_state(), - root->dirlock.get_replica_state()); + root->dirlock.get_replica_state(), + root->nestlock.get_replica_state(), + root->snaplock.get_replica_state(), + root->xattrlock.get_replica_state()); } if (stray) for (map::iterator r = stray->replicas_begin(); @@ -3255,7 +3302,10 @@ void MDCache::rejoin_send_acks() stray->linklock.get_replica_state(), stray->dirfragtreelock.get_replica_state(), stray->filelock.get_replica_state(), - stray->dirlock.get_replica_state()); + stray->dirlock.get_replica_state(), + stray->nestlock.get_replica_state(), + stray->snaplock.get_replica_state(), + stray->xattrlock.get_replica_state()); } // send acks @@ -5445,11 +5495,11 @@ void MDCache::_anchor_prepared(CInode *in, version_t atid, bool add) inode_t *pi = in->project_inode(); if (add) { pi->anchored = true; - pi->dirstat.ranchors++; + pi->rstat.ranchors++; in->parent->adjust_nested_anchors(1); } else { pi->anchored = false; - pi->dirstat.ranchors--; + pi->rstat.ranchors--; in->parent->adjust_nested_anchors(-1); } pi->version = in->pre_dirty(); @@ -5529,7 +5579,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) inode_t *pi = in->project_inode(); pi->version = in->pre_dirty(); - pi->dirstat.rsnaprealms++; + pi->rstat.rsnaprealms++; SnapRealm t(this, in); t.created = mdr->more()->stid; @@ -7030,6 +7080,11 @@ void MDCache::fragment_stored(MDRequest *mdr) mdr->ls->dirty_dirfrag_dir.push_back(&diri->xlist_dirty_dirfrag_dir); mdr->add_updated_scatterlock(&diri->dirlock); + // nestlock + mds->locker->mark_updated_scatterlock(&diri->nestlock); +
mdr->ls->dirty_dirfrag_nest.push_back(&diri->xlist_dirty_dirfrag_nest); + mdr->add_updated_scatterlock(&diri->nestlock); + // journal new dirfrag fragstats for each new fragment. for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 98489a1de65f6..edab308bd9d5a 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -344,7 +344,7 @@ struct MDSlaveUpdate { // flags for predirty_journal_parents() static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting static const int PREDIRTY_DIR = 2; // update parent dir mtime/size -static const int PREDIRTY_SHALLOW = 4; // only go to immediate parrent (for easier rollback) +static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback) class MDCache { @@ -484,7 +484,8 @@ public: inode_t *journal_dirty_inode(EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); void predirty_journal_parents(Mutation *mut, EMetaBlob *blob, CInode *in, CDir *parent, - int flags, int linkunlink=0); + int flags, int linkunlink=0, + snapid_t follows=CEPH_NOSNAP); // slaves void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set &slaves) { diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 37c604efebe8a..5c8c487e290ef 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1495,6 +1495,7 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus else rdlocks.insert(&dn->lock); // existing dn, rdlock wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime + wrlocks.insert(&dn->dir->inode->nestlock); // also, wrlock on dir rstat (nested stats) if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return 0; @@ -2068,7 +2069,7 @@ void Server::handle_client_mknod(MDRequest *mdr) if ((newi->inode.mode & S_IFMT) == 0) newi->inode.mode |= S_IFREG; newi->inode.version = dn->pre_dirty() - 1; - newi->inode.dirstat.rfiles = 1; + newi->inode.rstat.rfiles = 1;
newi->projected_parent = dn; dn->first = newi->first = follows+1; @@ -2113,7 +2114,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) newi->inode.mode |= S_IFDIR; newi->inode.layout = g_default_mds_dir_layout; newi->inode.version = dn->pre_dirty() - 1; - newi->inode.dirstat.rsubdirs = 1; + newi->inode.rstat.rsubdirs = 1; dn->first = newi->first = follows+1; @@ -2161,7 +2162,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->symlink = req->get_path2(); newi->inode.size = newi->symlink.length(); newi->inode.version = dn->pre_dirty() - 1; - newi->inode.dirstat.rfiles = 1; + newi->inode.rstat.rfiles = 1; dn->first = newi->first = follows+1; @@ -2236,6 +2237,7 @@ void Server::handle_client_link(MDRequest *mdr) rdlocks.insert(&linktrace[i]->lock); xlocks.insert(&dn->lock); wrlocks.insert(&dn->dir->inode->dirlock); + wrlocks.insert(&dn->dir->inode->nestlock); for (int i=0; i<(int)targettrace.size(); i++) rdlocks.insert(&targettrace[i]->lock); xlocks.insert(&targeti->linklock); @@ -2543,7 +2545,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr) rollback.old_ctime = targeti->inode.ctime; // we hold versionlock; no concorrent projections fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode(); rollback.old_dir_mtime = pf->fragstat.mtime; - rollback.old_dir_rctime = pf->fragstat.rctime; + rollback.old_dir_rctime = pf->rstat.rctime; rollback.was_inc = inc; ::encode(rollback, le->rollback); mdr->more()->rollback_bl = le->rollback; @@ -2676,9 +2678,10 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) pf->version = parent->pre_dirty(); if (pf->fragstat.mtime == pi->ctime) { pf->fragstat.mtime = rollback.old_dir_mtime; - if (pf->fragstat.rctime == pi->ctime) - pf->fragstat.rctime = rollback.old_dir_rctime; + if (pf->rstat.rctime == pi->ctime) + pf->rstat.rctime = rollback.old_dir_rctime; mut->add_updated_scatterlock(&parent->get_inode()->dirlock); + mut->add_updated_scatterlock(&parent->get_inode()->nestlock); } // 
inode @@ -2825,9 +2828,12 @@ void Server::handle_client_unlink(MDRequest *mdr) rdlocks.insert(&trace[i]->lock); xlocks.insert(&dn->lock); wrlocks.insert(&dn->dir->inode->dirlock); + wrlocks.insert(&dn->dir->inode->nestlock); xlocks.insert(&in->linklock); - if (straydn) + if (straydn) { wrlocks.insert(&straydn->dir->inode->dirlock); + wrlocks.insert(&straydn->dir->inode->nestlock); + } if (in->is_dir()) rdlocks.insert(&in->dirlock); // to verify it's empty @@ -3214,20 +3220,24 @@ void Server::handle_client_rename(MDRequest *mdr) set rdlocks, wrlocks, xlocks; // straydn? - if (straydn) + if (straydn) { wrlocks.insert(&straydn->dir->inode->dirlock); + wrlocks.insert(&straydn->dir->inode->nestlock); + } // rdlock sourcedir path, xlock src dentry for (int i=0; i<(int)srctrace.size()-1; i++) rdlocks.insert(&srctrace[i]->lock); xlocks.insert(&srcdn->lock); wrlocks.insert(&srcdn->dir->inode->dirlock); + wrlocks.insert(&srcdn->dir->inode->nestlock); // rdlock destdir path, xlock dest dentry for (int i=0; i<(int)desttrace.size(); i++) rdlocks.insert(&desttrace[i]->lock); xlocks.insert(&destdn->lock); wrlocks.insert(&destdn->dir->inode->dirlock); + wrlocks.insert(&destdn->dir->inode->nestlock); // xlock versionlock on srci if remote? 
// this ensures it gets safely remotely auth_pinned, avoiding deadlock; @@ -3349,7 +3359,7 @@ void Server::handle_client_rename(MDRequest *mdr) if (srcdn->is_primary() && (srcdn->inode->is_anchored() || - (srcdn->inode->is_dir() && (srcdn->inode->inode.dirstat.ranchors || + (srcdn->inode->is_dir() && (srcdn->inode->inode.rstat.ranchors || srcdn->inode->nested_anchors || !mdcache->is_leaf_subtree(mdcache->get_subtree_root(srcdn->dir))))) && !mdr->more()->src_reanchor_atid) { @@ -3501,6 +3511,7 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl mdr->ls, mdr->more()->cap_imports, updated_scatterlocks); srcdn->inode->dirlock.clear_updated(); + srcdn->inode->nestlock.clear_updated(); // hack: force back to !auth and clean, temporarily srcdn->inode->state_clear(CInode::STATE_AUTH); @@ -3599,13 +3610,13 @@ void Server::_rename_prepare(MDRequest *mdr, // sub off target if (destdn->is_auth() && !destdn->is_null()) mdcache->predirty_journal_parents(mdr, metablob, destdn->inode, destdn->dir, - (destdn->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1); + (destdn->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1); // move srcdn int predirty_primary = (srcdn->is_primary() && srcdn->dir != destdn->dir) ? 
PREDIRTY_PRIMARY:0; int flags = predirty_dir | predirty_primary; if (srcdn->is_auth()) - mdcache->predirty_journal_parents(mdr, metablob, srcdn->inode, srcdn->dir, flags, -1); + mdcache->predirty_journal_parents(mdr, metablob, srcdn->inode, srcdn->dir, PREDIRTY_SHALLOW|flags, -1); if (destdn->is_auth()) mdcache->predirty_journal_parents(mdr, metablob, srcdn->inode, destdn->dir, flags, 1); @@ -3907,7 +3918,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) rollback.orig_src.dirfrag = srcdn->dir->dirfrag(); rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime; - rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime; + rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->rstat.rctime; rollback.orig_src.dname = srcdn->name; if (srcdn->is_primary()) rollback.orig_src.ino = srcdn->inode->ino(); @@ -3919,7 +3930,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) rollback.orig_dest.dirfrag = destdn->dir->dirfrag(); rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime; - rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime; + rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->rstat.rctime; rollback.orig_dest.dname = destdn->name; if (destdn->is_primary()) rollback.orig_dest.ino = destdn->inode->ino(); @@ -3931,7 +3942,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) if (straydn) { rollback.stray.dirfrag = straydn->dir->dirfrag(); rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime; - rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime; + rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->rstat.rctime; rollback.stray.dname = straydn->name; } ::encode(rollback, mdr->more()->rollback_bl); @@ -4066,7 +4077,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, } 
void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, utime_t ctime, - bool isdir, int linkunlink, bool primary, frag_info_t &dirstat) + bool isdir, int linkunlink, bool primary, frag_info_t &dirstat, nest_info_t &rstat) { fnode_t *pf; if (dir->is_auth()) { @@ -4078,22 +4089,24 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut if (isdir) { pf->fragstat.nsubdirs += linkunlink; - pf->fragstat.rsubdirs += linkunlink; + pf->rstat.rsubdirs += linkunlink; } else { pf->fragstat.nfiles += linkunlink; - pf->fragstat.rfiles += linkunlink; + pf->rstat.rfiles += linkunlink; } if (primary) { - pf->fragstat.rbytes += linkunlink * dirstat.rbytes; - pf->fragstat.rfiles += linkunlink * dirstat.rfiles; - pf->fragstat.rsubdirs += linkunlink * dirstat.rsubdirs; - pf->fragstat.ranchors += linkunlink * dirstat.ranchors; + pf->rstat.rbytes += linkunlink * rstat.rbytes; + pf->rstat.rfiles += linkunlink * rstat.rfiles; + pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs; + pf->rstat.ranchors += linkunlink * rstat.ranchors; + pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms; } if (pf->fragstat.mtime == ctime) { pf->fragstat.mtime = r.dirfrag_old_mtime; - if (pf->fragstat.rctime == ctime) - pf->fragstat.rctime = r.dirfrag_old_rctime; + if (pf->rstat.rctime == ctime) + pf->rstat.rctime = r.dirfrag_old_rctime; mut->add_updated_scatterlock(&dir->get_inode()->dirlock); + mut->add_updated_scatterlock(&dir->get_inode()->nestlock); } } @@ -4187,7 +4200,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) pi->ctime = rollback.orig_src.old_ctime; _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime, - in->is_dir(), 1, srcdn->is_primary(), pi->dirstat); + in->is_dir(), 1, srcdn->is_primary(), pi->dirstat, pi->rstat); // repair dest CInode *target = 0; @@ -4214,16 +4227,17 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) } if (target) _rollback_repair_dir(mut, 
destdir, rollback.orig_dest, rollback.ctime, - target->is_dir(), 0, destdn->is_primary(), ti->dirstat); + target->is_dir(), 0, destdn->is_primary(), ti->dirstat, ti->rstat); else { frag_info_t blah; - _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime, 0, -1, 0, blah); + nest_info_t blah2; + _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime, 0, -1, 0, blah, blah2); } // repair stray if (straydir) _rollback_repair_dir(mut, straydir, rollback.stray, rollback.ctime, - target->is_dir(), -1, true, ti->dirstat); + target->is_dir(), -1, true, ti->dirstat, ti->rstat); dout(-10) << " srcdn back to " << *srcdn << dendl; dout(-10) << " srci back to " << *srcdn->inode << dendl; @@ -4703,7 +4717,7 @@ void Server::handle_client_openc(MDRequest *mdr) in->inode.mode |= S_IFREG; in->inode.version = dn->pre_dirty() - 1; in->inode.max_size = in->get_layout_size_increment(); - in->inode.dirstat.rfiles = 1; + in->inode.rstat.rfiles = 1; in->projected_parent = dn; dn->first = in->first = follows+1; diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 083f8038d0fb3..f412e4e303e53 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -28,6 +28,7 @@ inline const char *get_lock_type_name(int t) { case CEPH_LOCK_ILINK: return "ilink"; case CEPH_LOCK_IDFT: return "idft"; case CEPH_LOCK_IDIR: return "idir"; + case CEPH_LOCK_INEST: return "inest"; case CEPH_LOCK_IXATTR: return "ixattr"; case CEPH_LOCK_ISNAP: return "isnap"; case CEPH_LOCK_INO: return "ino"; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index f9bb7bc4f4854..1dbabe57cc730 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -139,6 +139,12 @@ C_Gather *LogSegment::try_to_expire(MDS *mds) if (!gather) gather = new C_Gather; mds->locker->scatter_nudge(&in->dirfragtreelock, gather->new_sub()); } + for (xlist::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl; + if 
(!gather) gather = new C_Gather; + mds->locker->scatter_nudge(&in->nestlock, gather->new_sub()); + } // open files if (!open_files.empty()) { diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index c0751c8c22cf6..93deae3ce5b91 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -83,6 +83,51 @@ struct frag_info_t { __s64 nsubdirs; // subdirs __s64 size() const { return nfiles + nsubdirs; } + void zero() { + memset(this, 0, sizeof(*this)); + } + + // *this += cur - acc; acc = cur + void take_diff(const frag_info_t &cur, frag_info_t &acc, bool& touched_mtime) { + if (!(cur.mtime == acc.mtime)) { + mtime = cur.mtime; + touched_mtime = true; + } + nfiles += cur.nfiles - acc.nfiles; + nsubdirs += cur.nsubdirs - acc.nsubdirs; + acc = cur; + acc.version = version; + } + + void encode(bufferlist &bl) const { + ::encode(version, bl); + ::encode(mtime, bl); + ::encode(nfiles, bl); + ::encode(nsubdirs, bl); + } + void decode(bufferlist::iterator &bl) { + ::decode(version, bl); + ::decode(mtime, bl); + ::decode(nfiles, bl); + ::decode(nsubdirs, bl); + } +}; +WRITE_CLASS_ENCODER(frag_info_t) + +inline bool operator==(const frag_info_t &l, const frag_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} + +inline ostream& operator<<(ostream &out, const frag_info_t &f) { + return out << "f(v" << f.version + << " m" << f.mtime + << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs + << ")"; +} + +struct nest_info_t { + version_t version; + // this frag + children utime_t rctime; __s64 rbytes; @@ -96,14 +141,22 @@ struct frag_info_t { void zero() { memset(this, 0, sizeof(*this)); } - void take_diff(const frag_info_t &cur, frag_info_t &acc, bool& touched_mtime) { - if (cur.mtime > mtime) { - rctime = mtime = cur.mtime; - touched_mtime = true; - } - nfiles += cur.nfiles - acc.nfiles; - nsubdirs += cur.nsubdirs - acc.nsubdirs; + void sub(const nest_info_t &other) { + add(other, -1); + } + void add(const nest_info_t &other, int fac=1) { + if (other.rctime > rctime) + 
rctime = other.rctime; + rbytes += fac*other.rbytes; + rfiles += fac*other.rfiles; + rsubdirs += fac*other.rsubdirs; + ranchors += fac*other.ranchors; + rsnaprealms += fac*other.rsnaprealms; + } + + // *this += cur - acc; acc = cur + void take_diff(const nest_info_t &cur, nest_info_t &acc) { if (cur.rctime > rctime) rctime = cur.rctime; rbytes += cur.rbytes - acc.rbytes; @@ -117,9 +170,6 @@ struct frag_info_t { void encode(bufferlist &bl) const { ::encode(version, bl); - ::encode(mtime, bl); - ::encode(nfiles, bl); - ::encode(nsubdirs, bl); ::encode(rbytes, bl); ::encode(rfiles, bl); ::encode(rsubdirs, bl); @@ -129,9 +179,6 @@ struct frag_info_t { } void decode(bufferlist::iterator &bl) { ::decode(version, bl); - ::decode(mtime, bl); - ::decode(nfiles, bl); - ::decode(nsubdirs, bl); ::decode(rbytes, bl); ::decode(rfiles, bl); ::decode(rsubdirs, bl); @@ -140,21 +187,19 @@ struct frag_info_t { ::decode(rctime, bl); } }; -WRITE_CLASS_ENCODER(frag_info_t) +WRITE_CLASS_ENCODER(nest_info_t) -inline bool operator==(const frag_info_t &l, const frag_info_t &r) { +inline bool operator==(const nest_info_t &l, const nest_info_t &r) { return memcmp(&l, &r, sizeof(l)) == 0; } -inline ostream& operator<<(ostream &out, const frag_info_t &f) { - return out << "f(v" << f.version - << " m" << f.mtime - << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs - << " rc" << f.rctime - << " b" << f.rbytes - << " a" << f.ranchors - << " sr" << f.rsnaprealms - << " " << f.rsize() << "=" << f.rfiles << "+" << f.rsubdirs +inline ostream& operator<<(ostream &out, const nest_info_t &n) { + return out << "n(v" << n.version + << " rc" << n.rctime + << " b" << n.rbytes + << " a" << n.ranchors + << " sr" << n.rsnaprealms + << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs << ")"; } @@ -218,8 +263,8 @@ struct inode_t { uint64_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes()) // dirfrag, recursive accounting - frag_info_t dirstat; - frag_info_t 
accounted_dirstat; // what dirfrag has seen + frag_info_t dirstat; + nest_info_t rstat, accounted_rstat; // special stuff version_t version; // auth only @@ -250,7 +295,8 @@ struct inode_t { ::encode(time_warp_seq, bl); ::encode(dirstat, bl); - ::encode(accounted_dirstat, bl); + ::encode(rstat, bl); + ::encode(accounted_rstat, bl); ::encode(version, bl); ::encode(file_data_version, bl); @@ -275,7 +321,8 @@ struct inode_t { ::decode(time_warp_seq, p); ::decode(dirstat, p); - ::decode(accounted_dirstat, p); + ::decode(rstat, p); + ::decode(accounted_rstat, p); ::decode(version, p); ::decode(file_data_version, p); @@ -309,16 +356,21 @@ WRITE_CLASS_ENCODER(old_inode_t) struct fnode_t { version_t version; frag_info_t fragstat, accounted_fragstat; + nest_info_t rstat, accounted_rstat; void encode(bufferlist &bl) const { ::encode(version, bl); ::encode(fragstat, bl); ::encode(accounted_fragstat, bl); + ::encode(rstat, bl); + ::encode(accounted_rstat, bl); } void decode(bufferlist::iterator &bl) { ::decode(version, bl); ::decode(fragstat, bl); ::decode(accounted_fragstat, bl); + ::decode(rstat, bl); + ::decode(accounted_rstat, bl); } }; WRITE_CLASS_ENCODER(fnode_t) diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index b8883ac5ea73d..b094f1be9d994 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -95,6 +95,7 @@ struct InodeStat { version_t time_warp_seq; frag_info_t dirstat; + nest_info_t rstat; string symlink; // symlink content (if symlink) fragtree_t dirfragtree; @@ -128,10 +129,11 @@ struct InodeStat { memset(&dirstat, 0, sizeof(dirstat)); dirstat.nfiles = e.files; dirstat.nsubdirs = e.subdirs; - dirstat.rctime.decode_timeval(&e.rctime); - dirstat.rbytes = e.rbytes; - dirstat.rfiles = e.rfiles; - dirstat.rsubdirs = e.rsubdirs; + + rstat.rctime.decode_timeval(&e.rctime); + rstat.rbytes = e.rbytes; + rstat.rfiles = e.rfiles; + rstat.rsubdirs = e.rsubdirs; int n = e.fragtree.nsplits; while (n) { diff --git 
a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index 7928d580bf34a..1e364f2d7c7f7 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -48,12 +48,12 @@ class MMDSCacheRejoin : public Message { int32_t linklock; int32_t dirfragtreelock; int32_t filelock; - int32_t dirlock; + int32_t dirlock, nestlock, snaplock, xattrlock; inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : + inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0, int nl=0, int snl=0, int xal=0) : caps_wanted(cw), nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } + authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl), nestlock(nl), snaplock(snl), xattrlock(xal) { } void encode(bufferlist &bl) const { ::encode(caps_wanted, bl); ::encode(nonce, bl); @@ -62,6 +62,9 @@ class MMDSCacheRejoin : public Message { ::encode(dirfragtreelock, bl); ::encode(filelock, bl); ::encode(dirlock, bl); + ::encode(nestlock, bl); + ::encode(snaplock, bl); + ::encode(xattrlock, bl); } void decode(bufferlist::iterator &bl) { ::decode(caps_wanted, bl); @@ -71,6 +74,9 @@ class MMDSCacheRejoin : public Message { ::decode(dirfragtreelock, bl); ::decode(filelock, bl); ::decode(dirlock, bl); + ::decode(nestlock, bl); + ::decode(snaplock, bl); + ::decode(xattrlock, bl); } }; WRITE_CLASS_ENCODER(inode_strong) @@ -197,8 +203,8 @@ class MMDSCacheRejoin : public Message { void add_weak_inode(inodeno_t i) { weak_inodes.insert(i); } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); + void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl, int nl, int snl, int xl) { + strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl, nl, snl, xl); } void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { 
full_inodes.push_back(inode_full(i, s, f)); -- 2.39.5