- xml import/export?
- ?
+- pg monitor service
+ - to support statfs?
+ - general pg health
+ - some sort of (throttled) osd status reporting
+ - dynamic pg creation (eventually!)
+
- SimpleMessenger
- clean up/merge Messenger/Dispatcher interfaces
- auto close idle connections
- generalize monitor client?
- throttle message resend attempts
-- paxos layer work
- - integrate leasing into paxos framework
- - carefully interface design...
-
code cleanup
- endian portability
general kernel planning
- soft consistency on lookup?
-
+- accurate reconstruction of (syscall) path?
- unlink needs to journal on witnesses (probably), since unlinked inodes may be in those journals
-/- make locks auth_pin for unstable states.
-/ - can locker *_eval() starve freezing?
-
-/- fix rename to delay the _apply.
-/ - need to fix locking vs migration first.
-/- rename_prep should create a subtree (auth,auth) so that cache expires are routed properly
-
-- then, mtimes:
-/ - avoid migration race concern (on auth).
-/ - writeback and dirty on gather.
-/ - cleaned up pv/pi makes writebehind play nice with concurrent updates.
-/ - should pin lock in LOCK state? (acdtually, gather)
-/ - scatterlock 'updated' flag.
- - on replica, clear only on sync | rescatter.
- - make sure "dirty" scatterlock prevents journal expire.
- - EMetaBlob map<inodeno_t,utime_t> dirty_scatter;
- - mtime must be greater, or scatterlock must be !updated.
-
-
-/- fix slave op commit/abort logic:
-/ - recovering node needs to know what stray prepare ops committed
-/ - include with import_map
-/ - wait for explicit commit/abort from peer.
-/ - surviving node needs to
-/ - wait for log to flush (commits to finish), then
-/ - for uncommitted master requests,
-/ - remove failed from witnesses, waiting_on_slave, and
-/ - redispatch
-/ - somehow wait for needed peers to recover...
-/ - for uncommitted slave requests,
-/ - include with import_map, wait for explicit commit/abort from peer.
-
-/- make unlink/link behave with commit/abort recovery
-
-/- new thrashing test with
-/ - link, unlink, and rename (lots of hard links!)
-/ - directory renames
-
-- fix up writeback of dir inode mtime
-- revisit wrlocks, dir inode mtime updates. esp in rename.
- - if auth, pin and be happy. decide early.
- - make no attempt to dirty inodes until a gather
-/ - pin scattered inodes
- - mtime will always get journaled...
- -> so, just make sure v/pv/dirtyness is sane on recovery...
- -> scatterlock should recover into scatter state, or whatever...
-
-
-- Q: locker vs migration
- - maybe unstable lock states should auth_pin. would simplify migration logic, and probably avoid a number of bugs. basically, the freeze would have to wait for any in-progress lock gathers (not that long!).
- - ...but does it play nice with wrlock? and unfortuantely auth_pinning wrlocks would kill performance on updates at dir delegation points.
- - ????
-
- stray reintegration
- stray purge on shutdown
- need to export stray crap to another mds..
- verify stray is empty on shutdown
-/- roll EAlloc into EMetaBlob (and maybe Purge)
-
- dir complete flag on migration.. does it go into the EMetaBlob too? can it be safely dropped?
- journal+recovery
// glockc -> lock?
else if (lock->get_state() == LOCK_GLOCKC &&
!lock->is_gathering() &&
- !lock->is_wrlocked() &&
- !lock->is_updated()) {
- dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock
+ !lock->is_wrlocked()) {
+ if (lock->is_updated()) {
+ scatter_writebehind(lock);
+ } else {
+ dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock
<< " on " << *lock->get_parent() << endl;
- lock->set_state(LOCK_LOCK);
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
- lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE);
- lock->get_parent()->auth_unpin();
+ lock->set_state(LOCK_LOCK);
+ //lock->get_parent()->put(CInode::PIN_SCATTERED);
+ lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE);
+ lock->get_parent()->auth_unpin();
+ }
}
// gSyncL -> sync?
else if ((lock->get_state() == LOCK_GTEMPSYNCC ||
lock->get_state() == LOCK_GTEMPSYNCL) &&
!lock->is_gathering() &&
- !lock->is_wrlocked() &&
- !lock->is_updated()) {
- dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock
- << " on " << *lock->get_parent() << endl;
- lock->set_state(LOCK_TEMPSYNC);
- lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
- lock->get_parent()->auth_unpin();
+ !lock->is_wrlocked()) {
+ if (lock->is_updated()) {
+ scatter_writebehind(lock);
+ } else {
+ dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock
+ << " on " << *lock->get_parent() << endl;
+ lock->set_state(LOCK_TEMPSYNC);
+ lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
+ lock->get_parent()->auth_unpin();
+ }
}
}
}
+// Journal a write-behind for the inode that owns `lock`.
+//
+// Called from scatter_eval_gather() when a gather has finished and the lock
+// is no longer wrlocked but is still flagged updated.  Projects the inode,
+// logs an EUpdate carrying the inode's primary dentry, and queues a
+// C_Locker_ScatterWB so that scatter_writebehind_finish() runs once the log
+// has synced.  The lock state is not changed here.
+void Locker::scatter_writebehind(ScatterLock *lock)
+{
+ CInode *in = (CInode*)lock->get_parent();
+ dout(10) << "scatter_writebehind on " << *lock << " on " << *in << endl;
+
+ // journal write-behind.
+ // project a new inode version; it is popped and dirtied in
+ // scatter_writebehind_finish().
+ inode_t *pi = in->project_inode();
+ pi->version = in->pre_dirty();
+
+ EUpdate *le = new EUpdate("dir.mtime writebehind");
+ // NOTE(review): get_parent_dn() is dereferenced without a null check --
+ // presumably the inode always has a parent dentry on this path; confirm.
+ le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
+ le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
+
+ // NOTE(review): presumably the mdlog takes ownership of `le` and of the
+ // completion context -- confirm.
+ mds->mdlog->submit_entry(le);
+ mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock));
+}
+
+// Completion half of scatter_writebehind(); invoked via
+// C_Locker_ScatterWB::finish().
+//
+// Applies the projected inode, clears the lock's updated flag, and re-runs
+// scatter_eval_gather() so the state transition that was held back while
+// the lock was flagged updated can be evaluated again.
+void Locker::scatter_writebehind_finish(ScatterLock *lock)
+{
+ CInode *in = (CInode*)lock->get_parent();
+ dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << endl;
+ in->pop_and_dirty_projected_inode();
+ lock->clear_updated();
+ scatter_eval_gather(lock);
+}
+
void Locker::scatter_eval(ScatterLock *lock)
{
dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << endl;
dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
<< " from " << from << ", last one"
<< endl;
-
- if (lock->is_updated()) {
- // journal write-behind.
- CInode *in = (CInode*)lock->get_parent();
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
-
- EUpdate *le = new EUpdate("dir.mtime writebehind");
- le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
- le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
-
- mds->mdlog->submit_entry(le);
- mds->mdlog->wait_for_sync(new C_Locker_GatherWB(this, lock));
- }
- else {
- // WARNING: this is non-optimal, but simplest.
- // just block the gather until we flush the writeback to the journal.
- scatter_eval_gather(lock);
- }
+ scatter_eval_gather(lock);
}
break;
delete m;
}
-void Locker::scatter_gather_writebehind(ScatterLock *lock)
-{
- CInode *in = (CInode*)lock->get_parent();
- dout(10) << "scatter_gather_writebehind on " << *lock << " on " << *in << endl;
- in->pop_and_dirty_projected_inode();
- lock->clear_updated();
- scatter_eval_gather(lock);
-}
bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr);
void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr);
- class C_Locker_GatherWB : public Context {
+ void scatter_writebehind(ScatterLock *lock);
+ // Completion context handed to mdlog->wait_for_sync() by
+ // scatter_writebehind(); finish() forwards to
+ // Locker::scatter_writebehind_finish() for the stored lock.
+ // The result code `r` is ignored.
+ class C_Locker_ScatterWB : public Context {
 Locker *locker;
 ScatterLock *lock;
 public:
- C_Locker_GatherWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {}
+ C_Locker_ScatterWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {}
 void finish(int r) {
- locker->scatter_gather_writebehind(lock);
+ locker->scatter_writebehind_finish(lock);
 }
 };
- void scatter_gather_writebehind(ScatterLock *lock);
+ void scatter_writebehind_finish(ScatterLock *lock);
// local
protected:
// i am now the subtree root.
root = dir;
+
+ eval_subtree_root(dir);
}
// adjust export pins
++p)
adjust_export_state(*p);
- // evaluate subtree inode dirlock?
- // (we should scatter the dirlock on subtree bounds)
- if (dir->inode->is_auth() &&
- dir->inode->dirlock.is_stable()) {
- // force the issue a bit
- if (!dir->inode->is_frozen())
- mds->locker->scatter_eval(&dir->inode->dirlock);
- else
- mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned **
- }
-
show_subtrees();
}
p != oldbounds.end();
++p)
try_subtree_merge_at(*p);
-
}
void MDCache::try_subtree_merge_at(CDir *dir)
// we are no longer a subtree or bound
subtrees.erase(dir);
subtrees[parent].erase(dir);
+
+ eval_subtree_root(dir);
}
show_subtrees(15);
}
+// Re-evaluate the dirlock of `dir`'s inode after `dir` has become, or has
+// stopped being, a subtree root (called from the subtree-auth adjustment
+// and from try_subtree_merge_at()).
+//
+// Does nothing unless the inode is auth and its dirlock is stable.
+void MDCache::eval_subtree_root(CDir *dir)
+{
+ // evaluate subtree inode dirlock?
+ // (we should scatter the dirlock on subtree bounds)
+ if (dir->inode->is_auth() &&
+ dir->inode->dirlock.is_stable()) {
+ // force the issue a bit
+ // a frozen inode takes the try_scatter_eval() path instead of
+ // scatter_eval().
+ if (!dir->inode->is_frozen())
+ mds->locker->scatter_eval(&dir->inode->dirlock);
+ else
+ mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned **
+ }
+}
+
void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, pair<int,int> auth)
{
while (1) {
// inode?
dout(10) << " " << i << ": " << anchortrace[i-1] << endl;
- CInode *in = get_inode(anchortrace[i-1].ino);
+ in = get_inode(anchortrace[i-1].ino);
if (in) break;
i--;
if (!i) {
- CInode *in = get_inode(anchortrace[i].dirfrag.ino);
- assert(in);
+ in = get_inode(anchortrace[i].dirfrag.ino);
+ assert(in); // actually, we may need to open the root or a foreign stray inode, here.
break;
}
}
void adjust_export_state(CDir *dir);
void try_subtree_merge(CDir *root);
void try_subtree_merge_at(CDir *root);
+ void eval_subtree_root(CDir *dir);
CDir *get_subtree_root(CDir *dir);
void remove_subtree(CDir *dir);
void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
// journal the mtime change anyway.
inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true);
ji->ctime = ji->mtime = mdr->now;
+
+ dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << endl;
blob->add_dirtied_inode_mtime(diri->ino(), mdr->now);
}
diri->pop_and_dirty_projected_inode();
dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl;
} else {
- dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " (non-dirty) on " << *diri << endl;
// dirlock scatterlock will propagate the update.
diri->inode.ctime = diri->inode.mtime = mtime;
diri->dirlock.set_updated();
+ dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mtime << " on " << *diri << endl;
}
}