- some sort of tester for PaxosService...
- osdmon needs to lower-bound old osdmap versions it keeps around?
+mds nested
+- fix rejoin vs updated dirfrag nested/dirlocks
+
mds mustfix
- replay of dir fragmentation (don't want dir frozen, pins, etc.?)
- fix rm -r vs mds exports
utime_t rctime; // \max_{children}(ctime, nested_ctime)
__u64 rbytes;
__u64 rfiles;
+ __u64 rsubdirs;
void encode(bufferlist &bl) const {
::encode(rbytes, bl);
::encode(rfiles, bl);
+ ::encode(rsubdirs, bl);
::encode(rctime, bl);
}
void decode(bufferlist::iterator &bl) {
::decode(rbytes, bl);
::decode(rfiles, bl);
+ ::decode(rsubdirs, bl);
::decode(rctime, bl);
}
};
/*
* like an inode, but for a dir frag
*/
-struct fnode_t {
- version_t version;
+struct frag_info_t { // per-dirfrag stats: mtime plus counts of files and subdirs
utime_t mtime;
- __u64 size; // files + dirs
- __u64 nprimary, nremote;
__u64 nfiles; // files
__u64 nsubdirs; // subdirs
- nested_info_t nested; // nested summation
- nested_info_t accounted_nested; // nested summation
+ __u64 size() { return nfiles + nsubdirs; } // total entries; computed on demand instead of stored
+
void encode(bufferlist &bl) const {
- ::encode(version, bl);
- ::encode(size, bl);
- ::encode(nprimary, bl);
- ::encode(nremote, bl);
+ ::encode(mtime, bl); // field order here must match decode() below
+ //::encode(size, bl); // not encoded: size() is derived from nfiles + nsubdirs
::encode(nfiles, bl);
::encode(nsubdirs, bl);
+ }
+ void decode(bufferlist::iterator &bl) {
+ ::decode(mtime, bl); // reads fields in the same order encode() wrote them
+ //::decode(size, bl); // not decoded: size() is derived from nfiles + nsubdirs
+ ::decode(nfiles, bl);
+ ::decode(nsubdirs, bl);
+ }
+};
+WRITE_CLASS_ENCODER(frag_info_t)
+
+struct fnode_t {
+ version_t version;
+ frag_info_t fraginfo, accounted_fraginfo; // this dir
+ nested_info_t nested, accounted_nested; // this dir + sum over children.
+
+ void encode(bufferlist &bl) const {
+ ::encode(version, bl);
+ ::encode(fraginfo, bl);
+ ::encode(accounted_fraginfo, bl);
::encode(nested, bl);
::encode(accounted_nested, bl);
}
void decode(bufferlist::iterator &bl) {
::decode(version, bl);
- ::decode(size, bl);
- ::decode(nprimary, bl);
- ::decode(nremote, bl);
- ::decode(nfiles, bl);
- ::decode(nsubdirs, bl);
+ ::decode(fraginfo, bl);
+ ::decode(accounted_fraginfo, bl);
::decode(nested, bl);
::decode(accounted_nested, bl);
}
if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound";
if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound";
- out << " s=" << dir.fnode.size;
- out << " rb=" << dir.fnode.nested.rbytes;
- out << " rf=" << dir.fnode.nested.rfiles;
+ out << " s=" << dir.fnode.fraginfo.size()
+ << "=" << dir.fnode.fraginfo.nfiles
+ << "+" << dir.fnode.fraginfo.nsubdirs;
+ out << " rb=" << dir.fnode.nested.rbytes << "/" << dir.fnode.accounted_nested.rbytes;
+ out << " rf=" << dir.fnode.nested.rfiles << "/" << dir.fnode.accounted_nested.rfiles;
+ out << " rd=" << dir.fnode.nested.rsubdirs << "/" << dir.fnode.accounted_nested.rsubdirs;
out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
if (dir.get_num_dirty())
nnull++;
else {
nitems++;
- fnode.size++;
if (dn->is_primary()) {
- fnode.nprimary++;
fnode.nested.rbytes += dn->get_inode()->inode.accounted_nested.rbytes;
fnode.nested.rfiles += dn->get_inode()->inode.accounted_nested.rfiles;
- } else {
- fnode.nremote++;
+ if (dn->get_inode()->is_dir())
+ fnode.fraginfo.nsubdirs++;
+ else
+ fnode.fraginfo.nfiles++;
+ } else if (dn->is_remote()) {
+ if (dn->get_remote_d_type() == (S_IFDIR >> 12))
+ fnode.fraginfo.nsubdirs++;
+ else
+ fnode.fraginfo.nfiles++;
}
}
out << " s=" << in.inode.size;
out << " rb=" << in.inode.nested.rbytes << "/" << in.inode.accounted_nested.rbytes;
out << " rf=" << in.inode.nested.rfiles << "/" << in.inode.accounted_nested.rfiles;
+ out << " rd=" << in.inode.nested.rsubdirs << "/" << in.inode.accounted_nested.rsubdirs;
// locks
out << " " << in.authlock;
dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl;
switch (type) {
case CEPH_LOCK_IDIR:
- xlist_dirty_inode_mtime.remove_myself();
+ xlist_dirty_dirfrag_dir.remove_myself();
break;
case CEPH_LOCK_INESTED:
- assert(0); // hmm!
+ xlist_dirty_dirfrag_nested.remove_myself();
break;
default:
xlist<CInode*>::item xlist_dirty;
public:
xlist<CInode*>::item xlist_open_file;
- xlist<CInode*>::item xlist_dirty_inode_mtime;
+ xlist<CInode*>::item xlist_dirty_dirfrag_dir;
+ xlist<CInode*>::item xlist_dirty_dirfrag_nested;
xlist<CInode*>::item xlist_purging_inode;
private:
inode_auth(CDIR_AUTH_DEFAULT),
replica_caps_wanted(0),
xlist_dirty(this), xlist_open_file(this),
- xlist_dirty_inode_mtime(this), xlist_purging_inode(this),
+ xlist_dirty_dirfrag_dir(this),
+ xlist_dirty_dirfrag_nested(this),
+ xlist_purging_inode(this),
auth_pins(0), nested_auth_pins(0),
versionlock(this, CEPH_LOCK_IVERSION, WAIT_VERSIONLOCK_OFFSET),
authlock(this, CEPH_LOCK_IAUTH, WAIT_AUTHLOCK_OFFSET),
return scatter_wrlock_finish((ScatterLock*)lock, mut);
case CEPH_LOCK_IVERSION:
return local_wrlock_finish((LocalLock*)lock, mut);
+ case CEPH_LOCK_IFILE:
+ return file_wrlock_finish((FileLock*)lock, mut);
default:
assert(0);
}
if (in->filelock.can_lease()) mask |= CEPH_LOCK_IFILE;
}
if (in->xattrlock.can_lease()) mask |= CEPH_LOCK_IXATTR;
+ //if (in->nestedlock.can_lease()) mask |= CEPH_LOCK_INESTED;
_issue_client_lease(in, mask, pool, client, bl, now, session);
return mask;
// nested ---------------------------------------------------------------
-void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in, bool parent_mtime)
+void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in,
+ bool do_parent, int dfiles, int dsubdirs)
{
+ dout(10) << "predirty_nested "
+ << do_parent << "/" << dfiles << "/" << dsubdirs
+ << " " << *in << dendl;
+
CDir *parent = in->get_projected_parent_dn()->get_dir();
- // initial diff from *in
inode_t *curi = in->get_projected_inode();
- __u64 drbytes;
- __u64 drfiles;
- utime_t rctime;
- if (in->is_dir()) {
- drbytes = curi->nested.rbytes - curi->accounted_nested.rbytes;
- drfiles = curi->nested.rfiles - curi->accounted_nested.rfiles;
- rctime = MAX(curi->ctime, curi->nested.rctime);
- } else {
- drbytes = curi->size - curi->accounted_nested.rbytes;
- drfiles = 1 - curi->accounted_nested.rfiles;
- rctime = curi->ctime;
+ if (curi->is_file()) {
+ curi->nested.rbytes = curi->size;
}
-
- dout(10) << "predirty_nested delta " << drbytes << " bytes / " << drfiles << " files from " << *in << dendl;
-
+
// build list of inodes to wrlock, dirty, and update
list<CInode*> lsi;
CInode *cur = in;
// opportunistically adjust parent dirfrag
CInode *pin = parent->get_inode();
- dout(10) << "predirty_nested delta " << drbytes << " bytes / " << drfiles << " files for " << *pin << dendl;
- if (pin->is_base())
- break;
+ if (do_parent) {
+ assert(mut->wrlocks.count(&pin->dirlock));
+ assert(mut->wrlocks.count(&pin->nestedlock));
+ }
- if (mut->wrlocks.count(&pin->dirlock) == 0 &&
- !scatter_wrlock_try(&pin->dirlock, mut)) {
- dout(10) << "predirty_nested can't wrlock " << pin->dirlock << " on " << *pin << dendl;
+ if (mut->wrlocks.count(&pin->nestedlock) == 0 &&
+ !scatter_wrlock_try(&pin->nestedlock, mut)) {
+ dout(10) << "predirty_nested can't wrlock " << pin->nestedlock << " on " << *pin << dendl;
break;
}
// inode -> dirfrag
+ __u64 drbytes = curi->nested.rbytes - curi->accounted_nested.rbytes;
+ __u64 drfiles = curi->nested.rfiles - curi->accounted_nested.rfiles;
+ __u64 drsubdirs = curi->nested.rsubdirs - curi->accounted_nested.rsubdirs;
+ utime_t rctime = MAX(curi->ctime, curi->nested.rctime);
+
mut->add_projected_fnode(parent);
fnode_t *pf = parent->project_fnode();
pf->version = parent->pre_dirty();
- if (parent_mtime) {
- dout(10) << "predirty_nested updating mtime on " << *parent << dendl;
- pf->mtime = rctime = mut->now;
- }
+ if (do_parent) {
+ dout(10) << "predirty_nested updating mtime/size on " << *parent << dendl;
+ pf->fraginfo.mtime = mut->now;
+ pf->fraginfo.nfiles += dfiles;
+ pf->fraginfo.nsubdirs += dsubdirs;
+ //pf->nested.rfiles += dfiles;
+ //pf->nested.rsubdirs += dsubdirs;
+ }
+ dout(10) << "predirty_nested delta "
+ << drbytes << " bytes / " << drfiles << " files / " << drsubdirs << " subdirs for "
+ << *parent << dendl;
pf->nested.rbytes += drbytes;
pf->nested.rfiles += drfiles;
+ pf->nested.rsubdirs += drsubdirs;
pf->nested.rctime = rctime;
-
+
curi->accounted_nested.rbytes += drbytes;
curi->accounted_nested.rfiles += drfiles;
+ curi->accounted_nested.rsubdirs += drsubdirs;
curi->accounted_nested.rctime = rctime;
- if (!pin->is_auth())
+ if (pin->is_base())
break;
+ if (!pin->is_auth()) {
+ if (do_parent)
+ mut->ls->dirty_dirfrag_dir.push_back(&pin->xlist_dirty_dirfrag_dir);
+ mut->ls->dirty_dirfrag_nested.push_back(&pin->xlist_dirty_dirfrag_nested);
+ break;
+ }
// dirfrag -> diri
mut->add_projected_inode(pin);
version_t ppv = pin->pre_dirty();
inode_t *pi = pin->project_inode();
pi->version = ppv;
- if (pf->mtime > pi->mtime)
- pi->mtime = pf->mtime;
+ if (do_parent) {
+ dout(10) << "predirty_nested updating size/mtime on " << *pin << dendl;
+ if (pf->fraginfo.mtime > pi->mtime)
+ pi->mtime = pf->fraginfo.mtime;
+ pi->size += pf->fraginfo.size() - pf->accounted_fraginfo.size();
+ pf->accounted_fraginfo = pf->fraginfo;
+ }
+ drbytes = pf->nested.rbytes - pf->accounted_nested.rbytes;
+ drfiles = pf->nested.rfiles - pf->accounted_nested.rfiles;
+ drsubdirs = pf->nested.rsubdirs - pf->accounted_nested.rsubdirs;
+ dout(10) << "predirty_nested delta "
+ << drbytes << " bytes / " << drfiles << " files / " << drsubdirs << " subdirs for "
+ << *pin << dendl;
pi->nested.rbytes += drbytes;
pi->nested.rfiles += drfiles;
- pi->nested.rctime = rctime;
+ pi->nested.rsubdirs += drsubdirs;
+ pi->nested.rctime = MAX(pf->fraginfo.mtime, pf->nested.rctime);
+ pf->accounted_nested = pf->nested;
- pf->accounted_nested.rbytes += drbytes;
- pf->accounted_nested.rfiles += drfiles;
- pf->accounted_nested.rctime = rctime;
-
// next parent!
cur = pin;
curi = pi;
parent = cur->get_projected_parent_dn()->get_dir();
-
- drbytes = curi->nested.rbytes - curi->accounted_nested.rbytes;
- drfiles = curi->nested.rfiles - curi->accounted_nested.rfiles;
- rctime = MAX(curi->ctime, curi->nested.rctime);
+ do_parent = false;
}
// now, stick it in the blob
void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls);
public:
- void predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in, bool parent_mtime);
+ void predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in, bool do_parent, int dfiles=0, int dsubdirs=0); // do_parent: also set the parent dirfrag's mtime and add dfiles/dsubdirs to its nfiles/nsubdirs
// local
protected:
xlist<CDentry*> dirty_dentries;
xlist<CInode*> open_files;
- xlist<CInode*> dirty_inode_mtimes;
+ xlist<CInode*> dirty_dirfrag_dir;
+ xlist<CInode*> dirty_dirfrag_nested;
xlist<MDSlaveUpdate*> slave_updates;
rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino());
dn->get_inode()->get_nested_dirfrags(nested);
- if (dn->get_inode()->dirlock.is_updated()) {
+ if (dn->get_inode()->dirlock.is_updated()) { // ******* FIXME *********
// include full inode to shed any dirtyscattered state
rejoin->add_full_inode(dn->get_inode()->inode,
dn->get_inode()->symlink,
else
rdlocks.insert(&dn->lock); // existing dn, rdlock
wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime
+ wrlocks.insert(&dn->dir->inode->nestedlock); // also, wrlock on dir nested (recursive) stats
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return 0;
dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl;
- blob->add_dirtied_inode_mtime(diri->ino(), mdr->now);
+ //blob->add_dirtied_inode_mtime(diri->ino(), mdr->now);
assert(mdr->ls);
- mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime);
+ mdr->ls->dirty_dirfrag_dir.push_back(&diri->xlist_dirty_dirfrag_dir);
}
return dirpv;
if ((newi->inode.mode & S_IFMT) == 0)
newi->inode.mode |= S_IFREG;
newi->inode.version = dn->pre_dirty() - 1;
+ newi->inode.nested.rfiles = 1;
dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
- mds->locker->predirty_nested(mdr, &le->metablob, newi, true);
+ mds->locker->predirty_nested(mdr, &le->metablob, newi, true, 1, 0);
//version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
newi->inode.mode |= S_IFDIR;
newi->inode.layout = g_default_mds_dir_layout;
newi->inode.version = dn->pre_dirty() - 1;
+ newi->inode.nested.rsubdirs = 1;
// ...and that new dir is empty.
CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
EUpdate *le = new EUpdate(mdlog, "mkdir");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
- mds->locker->predirty_nested(mdr, &le->metablob, newi, true);
+ mds->locker->predirty_nested(mdr, &le->metablob, newi, true, 0, 1);
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
le->metablob.add_dir(newdir, true, true); // dirty AND complete
newi->symlink = req->get_path2();
newi->inode.size = newi->symlink.length();
newi->inode.version = dn->pre_dirty() - 1;
-
+ newi->inode.nested.rfiles = 1;
+
// prepare finisher
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "symlink");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
- mds->locker->predirty_nested(mdr, &le->metablob, newi, true);
+ mds->locker->predirty_nested(mdr, &le->metablob, newi, true, 1, 0);
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
// log + wait
case CEPH_LOCK_IDFT: return "idft";
case CEPH_LOCK_IDIR: return "idir";
case CEPH_LOCK_IXATTR: return "ixattr";
+ case CEPH_LOCK_INESTED: return "inested";
case CEPH_LOCK_INO: return "ino";
default: assert(0); return 0;
}
*/
public:
struct dirlump {
- static const int STATE_COMPLETE = (1<<1);
- static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
+ static const int STATE_COMPLETE = (1<<1);
+ static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
//version_t dirv;
fnode_t fnode;
list<version_t> atids;
// inode dirlocks (scatterlocks) i've touched.
- map<inodeno_t, utime_t> dirty_inode_mtimes;
+ //map<inodeno_t, utime_t> dirty_inode_mtimes;
// ino's i've allocated
list<inodeno_t> allocated_inos;
::encode(lump_order, bl);
::encode(lump_map, bl);
::encode(atids, bl);
- ::encode(dirty_inode_mtimes, bl);
+ //::encode(dirty_inode_mtimes, bl);
::encode(allocated_inos, bl);
if (!allocated_inos.empty())
::encode(alloc_tablev, bl);
::decode(lump_order, bl);
::decode(lump_map, bl);
::decode(atids, bl);
- ::decode(dirty_inode_mtimes, bl);
+ //::decode(dirty_inode_mtimes, bl);
::decode(allocated_inos, bl);
if (!allocated_inos.empty())
::decode(alloc_tablev, bl);
atids.push_back(atid);
}
+ /*
void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) {
dirty_inode_mtimes[ino] = ctime;
}
+ */
void add_allocated_ino(inodeno_t ino, version_t tablev) {
allocated_inos.push_back(ino);
}
// dirty non-auth mtimes
- for (xlist<CInode*>::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) {
+ for (xlist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
CInode *in = *p;
- dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *in << dendl;
+ dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
if (!gather) gather = new C_Gather;
if (in->is_ambiguous_auth()) {
}
//(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub());
}
+ for (xlist<CInode*>::iterator p = dirty_dirfrag_nested.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ dout(10) << "try_to_expire waiting for nestedlock flush on " << *in << dendl;
+ if (!gather) gather = new C_Gather;
+
+ if (in->is_ambiguous_auth()) {
+ dout(10) << " waiting for single auth on " << *in << dendl;
+ in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub());
+ } else if (in->is_auth()) {
+ dout(10) << " i'm auth, unscattering nestedlock on " << *in << dendl;
+ assert(in->is_replicated()); // hrm!
+ mds->locker->scatter_lock(&in->nestedlock);
+ in->nestedlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub());
+ } else {
+ dout(10) << " i'm a replica, requesting nestedlock unscatter of " << *in << dendl;
+ mds->locker->scatter_try_unscatter(&in->nestedlock, gather->new_sub());
+ }
+ //(*p)->nestedlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub());
+ }
// open files
if (!open_files.empty()) {
dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
}
dir->set_version( lump.fnode.version );
- if (lump.is_dirty())
+ if (lump.is_dirty()) {
dir->_mark_dirty(logseg);
+ dir->get_inode()->dirlock.set_updated();
+ dir->get_inode()->nestedlock.set_updated();
+ }
+
if (lump.is_complete())
dir->mark_complete();
mds->anchorclient->got_journaled_agree(*p, logseg);
}
- // dirtied inode mtimes
+ /*// dirtied inode mtimes
if (!dirty_inode_mtimes.empty())
for (map<inodeno_t,utime_t>::iterator p = dirty_inode_mtimes.begin();
p != dirty_inode_mtimes.end();
in->dirlock.set_updated();
logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime);
}
+ */
// allocated_inos
if (!allocated_inos.empty()) {