/ - STICKY dir state and pin? make sure it's kept across import/export/fragment
/ - pull _bound maps out of Migrator; they are redundant (trust the subtree map!)
- - handle_resolve needs to infer splits/merges
+/ - handle_resolve needs to infer splits/merges
- rejoin, too!
- - auth journals and applies update in the request update pipeline
+/ - auth journals and applies update in the request update pipeline
- - dirfragtree is lazily consistent. no lock. bcast by primary when it updates.
- --> this makes it tricky to properly journal dirfragtree on the auth inode. what about a scatterlock?
-
+/ - dirfragtree is lazily consistent. no lock. bcast by primary when it updates.
+/ - bcast to dir replicas
- - bcast to dir replicas
- - inode auth will journal inode update separately/lazily
- - also on handle_resolve(), if there is a mismatch.
- - do i need a fragtrace_t something to tell me where the splits for a given frag occurred?
- - or something like a fragtree_t simplify()?
- - is there any reason to freeze the dir?
- - CDentry objects will be moved to the new frag(s)
- - Server etc. must take care not to carry CDir pointers around; they're unstable!
-
- - what about flushing the old dirfrag storage off disk...?
+/ - inode auth will journal inode update separately/lazily
+/ - via subtree_merge_at
- journal epoch, or something similar
if (dn->dir->items.empty())
dn->dir->put(PIN_CHILD);
- if (nitems == 0)
+ if (nnull + nitems == 0)
get(PIN_CHILD);
if (dn->is_null())
nnull++;
break;
case LOCK_OTYPE_IDIRFRAGTREE:
- dirfragtree._encode(bl);
+ {
+ // encode the raw tree
+ dirfragtree._encode(bl);
+
+ // also specify which frags are mine
+ set<frag_t> myfrags;
+ list<CDir*> dfls;
+ get_dirfrags(dfls);
+ for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p)
+ if ((*p)->is_auth())
+ myfrags.insert((*p)->get_frag());
+ _encode(myfrags, bl);
+ }
break;
case LOCK_OTYPE_IFILE:
break;
case LOCK_OTYPE_IDIRFRAGTREE:
- dirfragtree._decode(bl, off);
+ {
+ fragtree_t temp;
+ temp._decode(bl, off);
+ set<frag_t> authfrags;
+ _decode(authfrags, bl, off);
+ if (is_auth()) {
+ // auth. believe replica's auth frags only.
+ for (set<frag_t>::iterator p = authfrags.begin(); p != authfrags.end(); ++p)
+ dirfragtree.force_to_leaf(*p);
+ } else {
+ // replica. just take the tree.
+ dirfragtree.swap(temp);
+ }
+ }
break;
case LOCK_OTYPE_IFILE:
LocalLock versionlock;
SimpleLock authlock;
SimpleLock linklock;
- SimpleLock dirfragtreelock;
+ ScatterLock dirfragtreelock;
FileLock filelock;
ScatterLock dirlock;
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_start((FileLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_rdlock_start((ScatterLock*)lock, mdr);
default:
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_finish((FileLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_rdlock_finish((ScatterLock*)lock, mdr);
default:
bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_wrlock_start((ScatterLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
return scatter_wrlock_finish((ScatterLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return file_xlock_start((FileLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return local_xlock_start((LocalLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
assert(0);
default:
return file_xlock_finish((FileLock*)lock, mdr);
case LOCK_OTYPE_IVERSION:
return local_xlock_finish((LocalLock*)lock, mdr);
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
assert(0);
default:
case LOCK_OTYPE_DN:
case LOCK_OTYPE_IAUTH:
case LOCK_OTYPE_ILINK:
- case LOCK_OTYPE_IDIRFRAGTREE:
handle_simple_lock(lock, m);
break;
handle_file_lock((FileLock*)lock, m);
break;
+ case LOCK_OTYPE_IDIRFRAGTREE:
case LOCK_OTYPE_IDIR:
handle_scatter_lock((ScatterLock*)lock, m);
break;
}
// wait for write.
- lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE,
+ new C_MDS_RetryRequest(mdcache, mdr));
// initiate scatter or lock?
if (lock->is_stable()) {
inode_t *pi = in->project_inode();
pi->version = in->pre_dirty();
- EUpdate *le = new EUpdate("dir.mtime writebehind");
+ EUpdate *le = new EUpdate("scatter writebehind");
le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
try_subtree_merge_at(*p);
}
+class C_MDC_SubtreeMergeWB : public Context {  // completion callback: forwards to MDCache::subtree_merge_writebehind_finish()
+ MDCache *mdcache;  // cache whose subtree_merge_writebehind_finish() is invoked from finish()
+ CInode *in;  // inode passed back to the cache when the callback fires
+public:
+ C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i) : mdcache(mdc), in(i) {}  // just stores both pointers
+ void finish(int r) {  // r is not inspected
+ mdcache->subtree_merge_writebehind_finish(in);
+ }
+};
+
void MDCache::try_subtree_merge_at(CDir *dir)
{
dout(10) << "try_subtree_merge_at " << *dir << endl;
subtrees[parent].erase(dir);
eval_subtree_root(dir);
+
+ // journal inode?
+ // (this is a large hammer to ensure that dirfragtree updates will
+ // hit the disk before the relevant dirfrags ever close)
+ if (dir->inode->is_auth() &&
+ dir->inode->can_auth_pin()) {
+ CInode *in = dir->inode;
+ dout(10) << "try_subtree_merge_at journaling merged bound " << *in << endl;
+
+ in->auth_pin();
+
+ // journal write-behind.
+ inode_t *pi = in->project_inode();
+ pi->version = in->pre_dirty();
+
+ EUpdate *le = new EUpdate("subtree merge writebehind");
+ le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
+ le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
+
+ mds->mdlog->submit_entry(le);
+ mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in));
+ }
}
show_subtrees(15);
}
+// Completion half of the "subtree merge writebehind" started in
+// try_subtree_merge_at(): invoked via C_MDC_SubtreeMergeWB after
+// mdlog->wait_for_sync().  Applies the inode projected there and drops
+// the auth_pin that try_subtree_merge_at() took before journaling.
+void MDCache::subtree_merge_writebehind_finish(CInode *in)
+{
+ // log the inode itself (as try_subtree_merge_at does), not its address
+ dout(10) << "subtree_merge_writebehind_finish on " << *in << endl;
+ in->pop_and_dirty_projected_inode();
+ in->auth_unpin();
+}
+
void MDCache::eval_subtree_root(CDir *dir)
{
// evaluate subtree inode dirlock?
mds->locker->scatter_eval(&dir->inode->dirlock);
else
mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned **
- }
+ }
+
}
++pi) {
CInode *diri = get_inode(pi->first.ino);
if (!diri) continue;
- diri->dirfragtree.force_to_leaf(pi->first.frag);
+ bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag);
+ if (forced) {
+ dout(10) << " forced frag " << pi->first.frag << " to leaf in "
+ << diri->dirfragtree
+ << " on " << pi->first << endl;
+ }
CDir *dir = diri->get_dirfrag(pi->first.frag);
if (!dir) continue;
adjust_bounded_subtree_auth(dir, pi->second, from);
void MDCache::fragment_freeze(CInode *diri, list<CDir*>& frags, frag_t basefrag, int bits)
{
C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits));
-
+
+ // freeze the dirs
for (list<CDir*>::iterator p = frags.begin();
p != frags.end();
++p) {
void adjust_export_state(CDir *dir);
void try_subtree_merge(CDir *root);
void try_subtree_merge_at(CDir *root);
+ void subtree_merge_writebehind_finish(CInode *in);
void eval_subtree_root(CDir *dir);
CDir *get_subtree_root(CDir *dir);
void remove_subtree(CDir *dir);