sage mds
+- CDir should avoid a store in response to temporary files (create+unlink)
+ - count+flag new dentries
+ - add last_committed_equivalent?
+
- hmm, should we move ESubtreeMap out of the journal?
that would avoid all the icky weirdness in shutdown, with periodic logging, etc.
-- fix rejoin
-/ - validate dentry<->inode connectivity
-/ - clean up remove_gather() crap
- - add_strong_* should take the cache object?
-/ - all replicated scatterlocks should start out in scatter state.
-/ - parallel_fetch
- - missing/full
- - carefully document rejoin
- - cases
- - confounding factors
-
-
-
-
-
-
-- for open file caps:
- - a survivor will issue strong caps_watned, etc. if the rejoiner doesn't have it, they can request via missing/full.
- - a recovering node:
- - mark all non-auth caps stale
- - advertise non-auth open inodes/paths/Capability::Exports in rejoins.
- - in _weak_rejoin, traverse list, and if a path is mine, add claim cap (or add to parallel_fetch list, etc.)
-
-/- fix rename.. don't journal on witnesses unless we have to.
-- fix unlink.. journal on witnesses if the file is open.
-
-- unlink needs to journal on witnesses (probably), since unlinked inodes may be in those journals
- -> hmm, no, rejoin needs to be more robust, and validate namespace changes.
-
- extend/clean up filepath to allow paths relative to an ino
- fix path_traverse
- fix reconnect/rejoin open file weirdness
- need to export stray crap to another mds..
- verify stray is empty on shutdown
-- dir complete flag on migration.. does it go into the EMetaBlob too? can it be safely dropped?
-
-- journal+recovery
- - file capabilities i/o
- dirfrag split/merge
- client readdir for dirfrags
- consistency points/snapshots
- dentry versions vs dirfrags...
- statfs?
-- finish multistage rejoin
-- trim_on_rejoin
-
- more testing of failures + thrashing.
- is export prep dir open deadlock properly fixed by forge_replica_dir()?
- failures during recovery stages (resolve, rejoin)... make sure rejoin still works!
we break commit()'s preconditions when it fetches an incomplete dir.
- detect and deal with client failure
-
-- recovering open files
- - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state.
- - path+cap window will require some fetching of metadata from disk before doing the rejoin
- - failures during migration.. what about client stale/reap stuff and misplaced WR caps?
+ - failure during reconnect vs clientmap. although probably the whole thing needs a larger overhaul...
- inode.max_size
- inode.allocated_size
- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics.
- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry)
-- dir version/committed/etc versus migration, log expires.
- - DOCUMENT.
- fix rmdir empty exported dirfrag race
- export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race.
out << " inode=" << dn.get_inode();
+ if (dn.is_new()) out << " state=new";
+
if (dn.get_num_ref()) {
out << " |";
dn.print_pin_set(out);
// state+pin
if (!state_test(STATE_DIRTY)) {
state_set(STATE_DIRTY);
+ dir->inc_num_dirty();
get(PIN_DIRTY);
}
}
dir->mark_dirty(pv);
}
-void CDentry::mark_clean() {
+
+void CDentry::mark_clean()
+{
dout(10) << " mark_clean " << *this << endl;
assert(is_dirty());
assert(version <= dir->get_version());
- // this happens on export.
- //assert(version <= dir->get_last_committed_version());
-
// state+pin
state_clear(STATE_DIRTY);
+ dir->dec_num_dirty();
put(PIN_DIRTY);
+
+ if (state_test(STATE_NEW))
+ state_clear(STATE_NEW);
}
+void CDentry::mark_new()
+{
+ dout(10) << " mark_new " << *this << endl;
+ state_set(STATE_NEW);
+}
void CDentry::make_path(string& s)
{
class CDentry : public MDSCacheObject, public LRUObject {
public:
// -- state --
+ static const int STATE_NEW = 1;
// -- pins --
static const int PIN_INODEPIN = 1; // linked inode is pinned
void mark_dirty(version_t projected_dirv);
void mark_clean();
+ void mark_new();
+ bool is_new() { return state_test(STATE_NEW); }
// -- replication
CDentryDiscover *replicate_to(int rep);
out << " v=" << dir.get_version();
out << " cv=" << dir.get_committing_version();
out << "/" << dir.get_committed_version();
+ out << "/" << dir.get_committed_version_equivalent();
} else {
out << " rep@" << dir.authority();
if (dir.get_replica_nonce() > 1)
if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound";
out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
+ if (dir.get_num_dirty())
+ out << " dirty=" << dir.get_num_dirty();
+
if (dir.get_num_ref()) {
out << " |";
nitems = 0;
nnull = 0;
+ num_dirty = 0;
+
state = STATE_INITIAL;
projected_version = version = 0;
//assert(null_items.count(dn->name) == 0);
items[dn->name] = dn;
-
+
if (in) {
link_inode_work( dn, in );
} else {
assert(items.count(dn->name) == 1);
items.erase(dn->name);
+ // adjust dirty counter?
+ if (dn->state_test(CDentry::STATE_DIRTY))
+ num_dirty--;
+
cache->lru.lru_remove(dn);
delete dn;
}
+void CDir::try_remove_unlinked_dn(CDentry *dn)
+{
+ assert(dn->dir == this);
+
+ if (dn->is_new() && dn->is_dirty() &&
+ dn->get_num_ref() == 1) {
+ dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl;
+ dn->mark_clean();
+ remove_dentry(dn);
+
+ if (version == projected_version &&
+ committing_version == committed_version &&
+ num_dirty == 0) {
+ dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version
+ << " vs committed " << committed_version
+ << endl;
+ committed_version_equivalent = committed_version;
+ }
+ }
+}
+
+
CDirDiscover *CDir::replicate_to(int mds)
return dirfrag() < ((const CDir*)r)->dirfrag();
}
- protected:
+protected:
// contents
CDir_map_t items; // non-null AND null
size_t nitems; // # non-null
size_t nnull; // # null
+ int num_dirty;
+
// state
version_t version;
version_t committing_version;
version_t committed_version;
+ version_t committed_version_equivalent; // in case of, e.g., temporary file
version_t projected_version;
// lock nesting, freeze
}
size_t get_nitems() { return nitems; }
size_t get_nnull() { return nnull; }
-
- /*
- float get_popularity() {
- return popularity[0].get();
- }
- */
+ void inc_num_dirty() { num_dirty++; }
+ void dec_num_dirty() {
+ assert(num_dirty > 0);
+ num_dirty--;
+ }
+ int get_num_dirty() {
+ return num_dirty;
+ }
+
+ void try_remove_unlinked_dn(CDentry *dn);
// -- dentries and inodes --
public:
version_t get_projected_version() { return projected_version; }
version_t get_committing_version() { return committing_version; }
version_t get_committed_version() { return committed_version; }
+ version_t get_committed_version_equivalent() { return committed_version_equivalent; }
void set_committed_version(version_t v) { committed_version = v; }
version_t pre_dirty(version_t min=0);
root->inode.nlink = 1;
root->inode.layout = g_OSD_MDDirLayout;
- root->force_auth = pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
+ root->force_auth = pair<int,int>(0, CDIR_AUTH_UNKNOWN);
set_root( root );
add_inode( root );
p = n;
}
+ // adjust export pins
+ adjust_export_state(dir);
+ for (set<CDir*>::iterator p = subtrees[dir].begin();
+ p != subtrees[dir].end();
+ ++p)
+ adjust_export_state(*p);
+
// bound should now match.
verify_subtree_bounds(dir, bounds);
+/**
+ * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
+ *
+ * FIXME: wait, can this actually happen? a survivor should generate cache trim
+ * messages that clean these guys up...
+ */
void MDCache::rejoin_trim_undef_inodes()
{
dout(10) << "rejoin_trim_undef_inodes" << endl;
}
}
+ assert(rejoin_undef_inodes.empty()); // hmm: this shouldn't ever happen, actually!
rejoin_undef_inodes.clear();
}
purging[inode->ino][newsize] = *inode;
assert(inode->size > newsize);
+ _do_purge_inode(inode, newsize);
+}
+void MDCache::_do_purge_inode(inode_t *inode, off_t newsize)
+{
// remove
- mds->filer->remove(*inode, newsize, inode->size,
- 0, new C_MDC_PurgeFinish(this, inode->ino, newsize));
-
- /*} else {
+ if (inode->size > 0) {
+ mds->filer->remove(*inode, newsize, inode->size,
+ 0, new C_MDC_PurgeFinish(this, inode->ino, newsize));
+ } else {
// no need, empty file, just log it
purge_inode_finish(inode->ino, newsize);
}
- */
}
void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize)
dout(10) << "start_recovered_purges " << p->first
<< " size " << q->second.size
<< " to " << q->first << endl;
- mds->filer->remove(q->second, q->first, q->second.size,
- 0, new C_MDC_PurgeFinish(this, p->first, q->first));
+ _do_purge_inode(&q->second, q->first);
}
}
}
// log removal
version_t pdv = dn->pre_dirty();
- EUpdate *le = new EUpdate;
+ EUpdate *le = new EUpdate("purge_stray");
le->metablob.add_dir_context(dn->dir);
le->metablob.add_null_dentry(dn, true);
le->metablob.add_inode_truncate(dn->inode->inode, 0);
dn->dir->remove_dentry(dn);
// purge+remove inode
- if (in->inode.size > 0)
- purge_inode(&in->inode, 0);
+ purge_inode(&in->inode, 0);
remove_inode(in);
}
public:
// inode purging
void purge_inode(inode_t *inode, off_t newsize);
+ void _do_purge_inode(inode_t *inode, off_t newsize);
void purge_inode_finish(inodeno_t ino, off_t newsize);
void purge_inode_finish_2(inodeno_t ino, off_t newsize);
bool is_purging(inodeno_t ino, off_t newsize) {
void MDS::reconnect_done()
{
dout(1) << "reconnect_done" << endl;
+ set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state
+
+ /*
if (mdsmap->get_num_in_mds() == 1 &&
mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me!
} else {
set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state
}
+ */
}
void MDS::rejoin_joint_start()
// create
dn = dir->add_dentry(dname, 0);
+ dn->mark_new();
dout(10) << "prepare_null_dentry added " << *dn << endl;
return dn;
// clean up?
if (straydn)
mdcache->eval_stray(straydn);
+
+ // removing a new dn?
+ dn->dir->try_remove_unlinked_dn(dn);
}
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref
+
+ // removing a new dn?
+ dn->dir->try_remove_unlinked_dn(dn);
}
// update subtree map?
if (destdn->is_primary() && destdn->inode->is_dir())
mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir);
+
+ // removing a new dn?
+ srcdn->dir->try_remove_unlinked_dn(srcdn);
}
if (mds->clientmap.get_version() >= cmapv) {
dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version()
<< " >= " << cmapv << ", noop" << endl;
+
+ // hrm, this isn't very pretty.
+ if (!open)
+ mds->clientmap.trim_completed_requests(client_inst.name.num(), 0);
+
} else {
dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version()
<< " < " << cmapv << endl;
assert(mds->clientmap.get_version() + 1 == cmapv);
- if (open)
+ if (open) {
mds->clientmap.open_session(client_inst);
- else
+ } else {
mds->clientmap.close_session(client_inst.name.num());
+ mds->clientmap.trim_completed_requests(client_inst.name.num(), 0);
+ }
mds->clientmap.reset_projected(); // make it follow version.
}
}