LDINC = ar -rc
else
# For linux
-CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
+CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE
LDINC = ld -i -o
endif
sage
mds
+- journal+recovery
+ - local rename
+ - how to notify replicas...
+/ - stray purge
+ - stray reintegration
+ - remote link
+ - impl remote inode xlock
+ - ESlaveUpdate replay, resolution, etc.
+ - remote unlink
+ - remote rename
+ - file capabilities i/o
+- dirfrag split/merge
+ - client readdir for dirfrags
+- consistency points/snapshots
+ - dentry versions vs dirfrags...
+- statfs?
+
- finish multistage rejoin
+- trim_on_rejoin
- more testing of failures + thrashing.
- is export prep dir open deadlock properly fixed by forge_replica_dir()?
-
-- locker vs node failure
-- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics.
- failures during recovery stages (resolve, rejoin)... make sure rejoin still works!
-- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything)
-- incremental mdsmaps?
-- client failure
- dirfrag split
- make sure we are freezing _before_ we fetch to complete the dirfrag, else
we break commit()'s preconditions when it fetches an incomplete dir.
+- detect and deal with client failure
+
+- recovering open files
+ - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state.
+ - path+cap window will require some fetching of metadata from disk before doing the rejoin
+ - failures during migration.. what about client stale/reap stuff and misplaced WR caps?
+
+- inode.max_size
+
+- real chdir (directory "open")
+ - relative metadata ops
+
+
+
+- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics.
+
+- incremental mdsmaps?
+
- EMetaBlob items should return 'expired' if they have higher versions (and are thus described by a newer journal entry)
- dir version/committed/etc versus migration, log expires.
- DOCUMENT.
- test open_remote_ino
-- scatterlock
- - unlink, link, rename need to pre_dirty and update dir inode's mtime
- - tho need to make sure that behaves when dirfrag's inode is non-auth...
-
- FIXME how to journal root and stray inode content?
- in particular, i care about dirfragtree.. get it on rejoin?
- and dir sizes, if i add that... also on rejoin?
-- recovering open files
- - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state.
- - path+cap window will require some fetching of metadata from disk before doing the rejoin
- - failures during migration.. what about client stale/reap stuff and misplaced WR caps?
-- inode.max_size
-- journal+recovery
- - local rename
- - how to notify replicas...
-/ - stray purge
- - stray reintegration
- - remote link
- - impl remote inode xlock
- - ESlaveUpdate replay, resolution, etc.
- - remote unlink
- - rewrite to look like _link
- - remote rename
- - file capabilities i/o
-- filelock to control directory mtime, dentry changes
- - hmm, may have to change lock ordering, and Server::rdlock_path_pin_ref()
-- dirfrag split/merge
- - client readdir for dirfrags
-- consistency points/snapshots
- - dentry versions vs dirfrags...
-- real chdir (directory "open")
- - relative metadata ops
-- statfs?
-- fix lock caps gather ack versus ambiguous auth
foreign rename
objecter
+- transaction prepare/commit
- read+floor_lockout
osd/rados
+- transaction prepare/commit
+ - rollback
+ - rollback logging (to fix slow prepare vs rollback race)
- read+floor_lockout for clean STOGITH-like/fencing semantics after failover.
- separate out replication code into a PG class, to pave way for RAID
// --- journaler ---
journaler_allow_split_entries: true,
+ journaler_safe: false, // wait for COMMIT on journal writes
// --- mds ---
mds_cache_size: MDS_CACHE_SIZE,
mds_decay_halflife: 30,
mds_beacon_interval: 5.0,
- mds_beacon_grace: 100.0,
+ mds_beacon_grace: 10.0,
mds_log: true,
mds_log_max_len: MDS_CACHE_SIZE / 3,
mds_log_max_trimming: 10000,
mds_log_read_inc: 1<<20,
mds_log_pad_entry: 128,//256,//64,
- mds_log_before_reply: true,
mds_log_flush_on_shutdown: true,
mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log
mds_log_eopen_size: 100, // # open inodes per log entry
mds_bal_midchunk: .3, // any sub bigger than this taken in full
mds_bal_minchunk: .001, // never take anything smaller than this
+ mds_trim_on_rejoin: true,
mds_commit_on_shutdown: true,
mds_shutdown_check: 0, //30,
mds_shutdown_on_last_unmount: true,
ebofs_cloneable: false,
ebofs_verify: false,
ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing)
- ebofs_idle_commit_ms: 100, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms
+ ebofs_idle_commit_ms: 20, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms
ebofs_oc_size: 10000, // onode cache
ebofs_cc_size: 10000, // cnode cache
ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB
else if (strcmp(args[i], "--objecter_buffer_uncommitted") == 0)
g_conf.objecter_buffer_uncommitted = atoi(args[++i]);
+ else if (strcmp(args[i], "--journaler_safe") == 0)
+ g_conf.journaler_safe = atoi(args[++i]);
+
else if (strcmp(args[i], "--mds_cache_size") == 0)
g_conf.mds_cache_size = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log") == 0)
g_conf.mds_log = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_log_before_reply") == 0)
- g_conf.mds_log_before_reply = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log_max_len") == 0)
g_conf.mds_log_max_len = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log_read_inc") == 0)
// journaler
bool journaler_allow_split_entries;
-
+ bool journaler_safe;
+
// mds
int mds_cache_size;
float mds_cache_mid;
int mds_log_max_trimming;
int mds_log_read_inc;
int mds_log_pad_entry;
- bool mds_log_before_reply;
bool mds_log_flush_on_shutdown;
off_t mds_log_import_map_interval;
int mds_log_eopen_size;
float mds_bal_midchunk;
float mds_bal_minchunk;
+ bool mds_trim_on_rejoin;
bool mds_commit_on_shutdown;
int mds_shutdown_check;
bool mds_shutdown_on_last_unmount;
+
bool mds_verify_export_dirauth; // debug flag
bool mds_local_osd;
#define dout(x) if ((x) <= g_conf.debug) std::cout
#define dout2(x) if ((x) <= g_conf.debug) std::cout
+#define pdout(x,p) if ((x) <= (p)) std::cout
+
/**
* for cleaner output, bracket each line with
* dbeginl (in the dout macro) and dendl (in place of endl).
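 *
 * a minimal usage sketch (the message text and variable are hypothetical;
 * dendl is assumed to be defined as described above):
 *
 *   dout(10) << "doing something, v=" << v << dendl;
 *   pdout(10, g_conf.debug_mds) << "same, but checked against a caller-supplied debug level" << dendl;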
void CDir::freeze_tree_finish(Context *c)
{
+ // still freezing? (we may have been canceled)
+ if (!is_freezing()) {
+ dout(10) << "freeze_tree_finish no longer freezing, done on " << *this << endl;
+ c->finish(-1);
+ delete c;
+ return;
+ }
+
// freezeable now?
if (!is_freezeable()) {
// wait again!
state_clear(STATE_FREEZINGTREE);
// cancel freeze waiters
+ finish_waiting(WAIT_UNFREEZE);
finish_waiting(WAIT_FREEZEABLE, -1);
}
}
out << " v" << in.get_version();
- out << " auth=" << in.authlock;
- out << " link=" << in.linklock;
- out << " dft=" << in.dirfragtreelock;
- out << " file=" << in.filelock;
- out << " dir=" << in.dirlock;
+ // locks
+ out << " " << in.authlock;
+ out << " " << in.linklock;
+ out << " " << in.dirfragtreelock;
+ out << " " << in.filelock;
+ out << " " << in.dirlock;
- if (in.get_num_ref()) {
- out << " |";
- in.print_pin_set(out);
- }
-
// hack: spit out crap on which clients have caps
if (!in.get_client_caps().empty()) {
out << " caps={";
}
out << "}";
}
+
+ if (in.get_num_ref()) {
+ out << " |";
+ in.print_pin_set(out);
+ }
+
  out << " " << &in;
out << "]";
return out;
void print(ostream& out) {
out << "(";
- //out << get_lock_type_name(l.get_type()) << " ";
+ out << get_lock_type_name(get_type()) << " ";
out << get_filelock_state_name(get_state());
if (!get_gather_set().empty()) out << " g=" << get_gather_set();
if (is_rdlocked())
void Locker::dispatch(Message *m)
{
+
switch (m->get_type()) {
// locking
for (map<int,int>::iterator it = lock->get_parent()->replicas_begin();
it != lock->get_parent()->replicas_end();
it++) {
+ if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN)
+ continue;
MLock *m = new MLock(lock, msg, mds->get_nodeid());
mds->send_message_mds(m, it->first, MDS_PORT_LOCKER);
}
}
-void Locker::send_lock_message(SimpleLock *lock, int msg, bufferlist &data)
+void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
{
for (map<int,int>::iterator it = lock->get_parent()->replicas_begin();
it != lock->get_parent()->replicas_end();
it++) {
+ if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN)
+ continue;
MLock *m = new MLock(lock, msg, mds->get_nodeid());
m->set_data(data);
mds->send_message_mds(m, it->first, MDS_PORT_LOCKER);
assert(!in->is_auth());
in->replica_caps_wanted = wanted;
- mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
- in->replica_caps_wanted),
- auth, MDS_PORT_LOCKER);
+
+ if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
+ mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
+ in->replica_caps_wanted),
+ auth, MDS_PORT_LOCKER);
} else {
in->replica_caps_wanted_keep_until.sec_ref() = 0;
}
void Locker::handle_inode_file_caps(MInodeFileCaps *m)
{
+ // nobody should be talking to us during recovery.
+ assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping());
+
+ // ok
CInode *in = mdcache->get_inode(m->get_ino());
assert(in);
- assert(in->is_auth());// || in->is_proxy());
-
- dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
+ assert(in->is_auth());
- /*if (in->is_proxy()) {
- dout(7) << "proxy, fw" << endl;
- mds->send_message_mds(m, in->authority().first, MDS_PORT_LOCKER);
+ if (mds->is_rejoin() &&
+ in->is_rejoining()) {
+ dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << endl;
+ delete m;
return;
}
- */
+
+
+ dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
if (m->get_caps())
in->mds_caps_wanted[m->get_from()] = m->get_caps();
void Locker::handle_lock(MLock *m)
{
+ // nobody should be talking to us during recovery.
+ assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping());
+
switch (m->get_otype()) {
case LOCK_OTYPE_DN:
{
{
int from = m->get_asker();
+ if (mds->is_rejoin()) {
+ if (lock->get_parent()->is_rejoining()) {
+ dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent()
+ << ", dropping " << *m << endl;
+ delete m;
+ return;
+ }
+ }
+
switch (m->get_action()) {
// -- replica --
case LOCK_AC_SYNC:
dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->set_state(LOCK_GLOCKR);
- lock->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m));
- return;
+ } else {
+ // update lock and reply
+ lock->set_state(LOCK_LOCK);
+ mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()),
+ from, MDS_PORT_LOCKER);
}
-
- // update lock and reply
- lock->set_state(LOCK_LOCK);
-
- mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()),
- from, MDS_PORT_LOCKER);
break;
MDRequest *mdr = mdcache->request_get(m->get_reqid());
mdr->xlocks.insert(lock);
mdr->locks.insert(lock);
+ lock->set_state(LOCK_REMOTEXLOCK);
lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
}
break;
}
+class C_Locker_SimpleEval : public Context {
+ Locker *locker;
+ SimpleLock *lock;
+public:
+ C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {}
+ void finish(int r) {
+ locker->simple_eval(lock);
+ }
+};
+
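// simple_eval, roughly: bail out (and wait) while the parent's auth is
// ambiguous; release a finished remote xlock back to the auth; complete a
// GLOCKR gather once the last reader/gatherer is gone (acking the auth if
// we are a replica); and finally, on a stable auth lock with no pending
// writers, drift back to SYNC.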
void Locker::simple_eval(SimpleLock *lock)
{
- // finished gather?
- if (lock->get_parent()->is_auth() &&
- !lock->is_stable() &&
- !lock->is_gathering()) {
+ // unstable and ambiguous auth?
+ if (!lock->is_stable() &&
+ lock->get_parent()->is_ambiguous_auth()) {
+ dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl;
+ //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock));
+ return;
+ }
+
+ // finished remote xlock?
+ if (lock->get_state() == LOCK_REMOTEXLOCK &&
+ !lock->is_xlocked()) {
+ // tell auth
+ assert(!lock->get_parent()->is_auth()); // should be auth_pinned on the auth
+ dout(7) << "simple_eval releasing remote xlock on " << *lock->get_parent() << endl;
+ int auth = lock->get_parent()->authority().first;
+ if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
+ mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()),
+ auth, MDS_PORT_LOCKER);
+ lock->set_state(LOCK_LOCK);
+ }
+
+ // finished gathering?
+ if (lock->get_state() == LOCK_GLOCKR &&
+ !lock->is_gathering() &&
+ !lock->is_rdlocked()) {
dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << endl;
- switch (lock->get_state()) {
- case LOCK_GLOCKR:
- lock->set_state(LOCK_LOCK);
- lock->finish_waiters(SimpleLock::WAIT_STABLE);
- break;
-
- default:
- assert(0);
+
+ // replica: tell auth
+ if (!lock->get_parent()->is_auth()) {
+ int auth = lock->get_parent()->authority().first;
+ if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
+ mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()),
+ lock->get_parent()->authority().first, MDS_PORT_LOCKER);
}
- }
- if (!lock->is_stable()) return;
-
- if (lock->get_parent()->is_auth()) {
- // sync?
- if (lock->get_state() != LOCK_SYNC &&
- lock->get_parent()->is_replicated() &&
- !lock->is_waiter_for(SimpleLock::WAIT_WR)) {
- dout(7) << "simple_eval stable, syncing " << *lock
- << " on " << *lock->get_parent() << endl;
- simple_sync(lock);
- }
+ lock->set_state(LOCK_LOCK);
+ lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR);
+ }
- } else {
- // replica
+ // stable -> sync?
+ if (lock->get_parent()->is_auth() &&
+ lock->is_stable() &&
+ lock->get_state() != LOCK_SYNC &&
+ !lock->is_waiter_for(SimpleLock::WAIT_WR)) {
+ dout(7) << "simple_eval stable, syncing " << *lock
+ << " on " << *lock->get_parent() << endl;
+ simple_sync(lock);
}
}
if (lock->get_state() == LOCK_GLOCKR)
assert(0); // um... hmm!
assert(lock->get_state() == LOCK_LOCK);
-
- // hard data
- bufferlist data;
- lock->encode_locked_state(data);
-
- // bcast to replicas
- send_lock_message(lock, LOCK_AC_SYNC, data);
+
+ // sync.
+ if (lock->get_parent()->is_replicated()) {
+ // hard data
+ bufferlist data;
+ lock->encode_locked_state(data);
+
+ // bcast to replicas
+ send_lock_message(lock, LOCK_AC_SYNC, data);
+ }
// change lock
lock->set_state(LOCK_SYNC);
dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
- if (lock->get_state() == LOCK_GLOCKR &&
- !lock->is_rdlocked()) {
- lock->set_state(LOCK_SYNC); // return state to sync, in case the unpinner flails
- lock->finish_waiters(SimpleLock::WAIT_NOLOCKS);
- }
+ // last one?
+ if (!lock->is_rdlocked())
+ simple_eval(lock);
}
bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr)
new C_MDS_RetryRequest(mdcache, mdr));
return false;
}
+ int auth = lock->get_parent()->authority().first;
// wait for sync.
// (???????????)
}
// send lock request
- int auth = lock->get_parent()->authority().first;
MLock *m = new MLock(lock, LOCK_AC_REQXLOCK, mds->get_nodeid());
mds->send_message_mds(m, auth, MDS_PORT_LOCKER);
// drop ref
assert(lock->can_xlock(mdr));
lock->put_xlock();
+ assert(mdr);
mdr->xlocks.erase(lock);
mdr->locks.erase(lock);
dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
- // slave?
- if (!lock->get_parent()->is_auth()) {
- mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()),
- lock->get_parent()->authority().first, MDS_PORT_LOCKER);
- }
-
// others waiting?
- if (lock->is_waiter_for(SimpleLock::WAIT_WR)) {
- // wake 'em up
- lock->finish_waiters(SimpleLock::WAIT_WR, 0);
- } else {
- // auto-sync if alone.
- if (lock->get_parent()->is_auth() &&
- !lock->get_parent()->is_replicated() &&
- lock->get_state() != LOCK_SYNC)
- lock->set_state(LOCK_SYNC);
-
- simple_eval(lock);
- }
+ lock->finish_waiters(SimpleLock::WAIT_WR, 0);
+
+ // eval
+ simple_eval(lock);
}
scatter_eval(lock);
}
+
+class C_Locker_ScatterEval : public Context {
+ Locker *locker;
+ ScatterLock *lock;
+public:
+ C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {}
+ void finish(int r) {
+ locker->scatter_eval(lock);
+ }
+};
+
void Locker::scatter_eval(ScatterLock *lock)
{
+ // unstable and ambiguous auth?
+ if (!lock->is_stable() &&
+ lock->get_parent()->is_ambiguous_auth()) {
+ dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl;
+ //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock));
+ return;
+ }
+
if (!lock->get_parent()->is_auth()) {
// REPLICA
if (lock->get_state() == LOCK_GSYNCS &&
!lock->is_wrlocked()) {
dout(10) << "scatter_eval no wrlocks, acking sync" << endl;
- bufferlist data;
- lock->encode_locked_state(data);
- mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data),
- lock->get_parent()->authority().first, MDS_PORT_LOCKER);
+ int auth = lock->get_parent()->authority().first;
+ if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
+ bufferlist data;
+ lock->encode_locked_state(data);
+ mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data),
+ auth, MDS_PORT_LOCKER);
+ }
lock->set_state(LOCK_SYNC);
+ lock->finish_waiters(ScatterLock::WAIT_STABLE); // ?
}
} else {
dout(7) << "scatter_eval finished gather/un-wrlock on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->set_state(LOCK_SYNC);
- lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD|SimpleLock::WAIT_NOLOCKS);
+ lock->finish_waiters(ScatterLock::WAIT_STABLE|ScatterLock::WAIT_RD);
}
// gscatters -> scatter?
}
lock->set_state(LOCK_SCATTER);
- lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+ lock->get_wrlock();
+ lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
+ lock->put_wrlock();
}
// waiting for rd?
if (lock->get_state() == LOCK_SCATTER &&
!lock->is_wrlocked() &&
- lock->is_waiter_for(SimpleLock::WAIT_RD)) {
+ lock->is_waiter_for(ScatterLock::WAIT_RD)) {
dout(10) << "scatter_eval no wrlocks, read waiter, syncing" << endl;
scatter_sync(lock);
}
}
else if (lock->is_wrlocked()) {
lock->set_state(LOCK_GSYNCS);
- } else {
+ }
+ else {
lock->set_state(LOCK_SYNC);
- lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+ lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
}
}
send_lock_message(lock, LOCK_AC_SCATTER, data);
}
lock->set_state(LOCK_SCATTER);
- lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+ lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
}
}
{
int from = m->get_asker();
+ if (mds->is_rejoin()) {
+ if (lock->get_parent()->is_rejoining()) {
+ dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent()
+ << ", dropping " << *m << endl;
+ delete m;
+ return;
+ }
+ }
+
switch (m->get_action()) {
// -- replica --
case LOCK_AC_SYNC:
assert(lock->get_state() == LOCK_SYNC);
lock->decode_locked_state(m->get_data());
lock->set_state(LOCK_SCATTER);
- lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+ lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
break;
// -- for auth --
dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
<< " from " << from << ", last one"
<< endl;
- simple_eval(lock);
+ scatter_eval(lock);
}
break;
}
dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
- if (!lock->is_rdlocked()) {
- lock->finish_waiters(SimpleLock::WAIT_NOLOCKS);
+ if (!lock->is_rdlocked())
file_eval(lock);
- }
}
dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
assert(lock->get_parent()->is_auth()); // or implement remote xlocks
-
- // drop lock?
- if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE))
- file_eval(lock);
+
+ // others waiting?
+ lock->finish_waiters(SimpleLock::WAIT_WR, 0);
+
+ //// drop lock?
+ //if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE))
+ file_eval(lock);
}
 * - checks if we're in unstable soft state and can now move on to next state
* - checks if soft state should change (eg bc last writer closed)
*/
+class C_Locker_FileEval : public Context {
+ Locker *locker;
+ FileLock *lock;
+public:
+ C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {}
+ void finish(int r) {
+ locker->file_eval(lock);
+ }
+};
+
void Locker::file_eval(FileLock *lock)
{
CInode *in = (CInode*)lock->get_parent();
+ // unstable and ambiguous auth?
+ if (!lock->is_stable() &&
+ in->is_ambiguous_auth()) {
+ dout(7) << "file_eval not stable and ambiguous auth, waiting on " << *in << endl;
+ //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock));
+ return;
+ }
+
+
int issued = in->get_caps_issued();
// [auth] finished gather?
// waiters
lock->get_rdlock();
- lock->finish_waiters(SimpleLock::WAIT_STABLE);
+ lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD);
lock->put_rdlock();
}
break;
if (lock->get_state() == LOCK_LOCK) {
if (in->is_replicated()) {
- // soft data
bufferlist softdata;
lock->encode_locked_state(softdata);
-
- // bcast to replicas
send_lock_message(lock, LOCK_AC_SYNC, softdata);
}
-
+
// change lock
lock->set_state(LOCK_SYNC);
- // reissue caps
- issue_caps(in);
+ issue_caps(in); // reissue caps
return true;
}
issue_caps(in);
} else {
// no writers, go straight to sync
-
if (in->is_replicated()) {
- // bcast to replicas
- send_lock_message(lock, LOCK_AC_SYNC);
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+ send_lock_message(lock, LOCK_AC_SYNC, softdata);
}
// change lock
} else {
// no writers, go straight to sync
if (in->is_replicated()) {
- // bcast to replicas
- send_lock_message(lock, LOCK_AC_SYNC);
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+ send_lock_message(lock, LOCK_AC_SYNC, softdata);
}
// change lock
CInode *in = (CInode*)lock->get_parent();
int from = m->get_asker();
+ if (mds->is_rejoin()) {
+ if (in->is_rejoining()) {
+ dout(7) << "handle_file_lock still rejoining " << *in
+ << ", dropping " << *m << endl;
+ delete m;
+ return;
+ }
+ }
+
+
dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " "
<< *in << " filelock=" << *lock << endl;
}
if (lock->is_rdlocked()) {
dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl;
- in->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m));
lock->set_state(LOCK_GLOCKR);
- assert(0);// i am broken.. why retry message when state captures all the info i need?
- return;
+ break;
}
if (issued & CAP_FILE_RD) {
+ dout(7) << "handle_file_lock RD cap issued, waiting before ack on " << *in << endl;
lock->set_state(LOCK_GLOCKR);
break;
}
-
+
// nothing to wait for, lock and ack.
{
lock->set_state(LOCK_LOCK);
void handle_lock(MLock *m);
void send_lock_message(SimpleLock *lock, int msg);
- void send_lock_message(SimpleLock *lock, int msg, bufferlist &data);
+ void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
// -- locks --
bool acquire_locks(MDRequest *mdr,
void MDBalancer::hit_inode(CInode *in, int type)
{
// hit me
- in->popularity[MDS_POP_JUSTME].pop[type].hit();
- in->popularity[MDS_POP_NESTED].pop[type].hit();
+ float me = in->popularity[MDS_POP_JUSTME].pop[type].hit();
+ float nested = in->popularity[MDS_POP_NESTED].pop[type].hit();
+ float curdom = 0;
+ float anydom = 0;
if (in->is_auth()) {
- in->popularity[MDS_POP_CURDOM].pop[type].hit();
- in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+ curdom = in->popularity[MDS_POP_CURDOM].pop[type].hit();
+ anydom = in->popularity[MDS_POP_ANYDOM].pop[type].hit();
}
-
+
+ dout(20) << "hit_inode " << type << " pop " << me << " me, "
+ << nested << " nested, "
+ << curdom << " curdom, "
+ << anydom << " anydom"
+ << " on " << *in
+ << endl;
+
// hit auth up to import
CDir *dir = in->get_parent_dir();
if (dir) hit_dir(dir, type);
if (g_conf.num_mds > 2 && // FIXME >2 thing
!dir->inode->is_root() && // not root (for now at least)
dir->is_auth()) {
- //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl;
+ dout(20) << "hit_dir " << type << " pop " << v << " me "
+ << *dir << endl;
// hash this dir? (later?)
if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) ||
// replicate?
float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm??
+ dout(20) << "hit_recursive " << type << " pop " << dir_pop << " curdom " << *dir << endl;
+
if (dir->is_auth()) {
if (!dir->is_rep() &&
dir_pop >= g_conf.mds_bal_replicate_threshold) {
rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp;
rd_adj /= 2.0; // temper somewhat
- dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
+ dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
dir->dir_rep = CDir::REP_ALL;
mds->mdcache->send_dir_updates(dir, true);
dir->is_rep() &&
dir_pop < g_conf.mds_bal_unreplicate_threshold) {
// unreplicate
- dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+ dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
dir->dir_rep = CDir::REP_NONE;
mds->mdcache->send_dir_updates(dir);
#include "events/ESlaveUpdate.h"
#include "events/EString.h"
#include "events/EPurgeFinish.h"
+#include "events/EImportFinish.h"
#include "messages/MGenericMessage.h"
if (dir->authority().first != CDIR_AUTH_UNKNOWN) {
dout(10) << "ambiguous import auth known, must not be me " << *dir << endl;
cancel_ambiguous_import(q->first);
+ mds->mdlog->submit_entry(new EImportFinish(dir, false));
} else {
dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl;
finish_ambiguous_import(q->first);
+ mds->mdlog->submit_entry(new EImportFinish(dir, true));
}
}
assert(my_ambiguous_imports.empty());
}
-/*
+/** recalc_auth_bits()
* once subtree auth is disambiguated, we need to adjust all the
- * auth (and dirty) bits in our cache before moving on.
+ * auth and dirty bits in our cache before moving on.
*/
void MDCache::recalc_auth_bits()
{
dout(7) << "recalc_auth_bits" << endl;
- for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
- p != inode_map.end();
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
++p) {
- CInode *in = p->second;
- if (in->authority().first == mds->get_nodeid())
- in->state_set(CInode::STATE_AUTH);
- else {
- in->state_clear(CInode::STATE_AUTH);
- if (in->is_dirty())
- in->mark_clean();
- }
+ list<CDir*> dfq; // dirfrag queue
+ dfq.push_back(p->first);
- if (in->parent) {
- if (in->parent->authority().first == mds->get_nodeid())
- in->parent->state_set(CDentry::STATE_AUTH);
- else {
- in->parent->state_clear(CDentry::STATE_AUTH);
- if (in->parent->is_dirty())
- in->parent->mark_clean();
- }
- }
+ bool auth = p->first->authority().first == mds->get_nodeid();
+ dout(10) << " subtree auth=" << auth << " for " << *p->first << endl;
- list<CDir*> ls;
- for (list<CDir*>::iterator p = ls.begin();
- p != ls.end();
- ++p) {
- CDir *dir = *p;
- if (dir->authority().first == mds->get_nodeid())
+ while (!dfq.empty()) {
+ CDir *dir = dfq.front();
+ dfq.pop_front();
+
+ // dir
+ if (auth)
dir->state_set(CDir::STATE_AUTH);
else {
+ dir->state_set(CDir::STATE_REJOINING);
dir->state_clear(CDir::STATE_AUTH);
if (dir->is_dirty())
dir->mark_clean();
}
+
+ // dentries in this dir
+ for (map<string,CDentry*>::iterator q = dir->items.begin();
+ q != dir->items.end();
+ ++q) {
+ // dn
+ CDentry *dn = q->second;
+ if (auth)
+ dn->state_set(CDentry::STATE_AUTH);
+ else {
+ dn->state_set(CDentry::STATE_REJOINING);
+ dn->state_clear(CDentry::STATE_AUTH);
+ if (dn->is_dirty())
+ dn->mark_clean();
+ }
+
+ if (dn->is_primary()) {
+ // inode
+ if (auth)
+ dn->inode->state_set(CInode::STATE_AUTH);
+ else {
+ dn->inode->state_set(CInode::STATE_REJOINING);
+ dn->inode->state_clear(CInode::STATE_AUTH);
+ if (dn->inode->is_dirty())
+ dn->inode->mark_clean();
+ }
+
+ // recurse?
+ if (dn->inode->is_dir())
+ dn->inode->get_nested_dirfrags(dfq);
+ }
+ }
}
}
+
show_subtrees();
show_cache();
}
in->authlock.get_state(),
in->linklock.get_state(),
in->dirfragtreelock.get_state(),
- in->filelock.get_state());
+ in->filelock.get_state(),
+ in->dirlock.get_state());
if (in->authlock.is_xlocked())
rejoin->add_inode_xlock(in->ino(), in->authlock.get_type(),
in->authlock.get_xlocked_by()->reqid);
if (mds->is_active() || mds->is_stopping()) {
dout(10) << "i am active. removing stale cache replicas" << endl;
- // first, scour cache of replica references
+ // first, scour cache of unmentioned replica references
for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
p != inode_map.end();
++p) {
if (dn) {
int nonce = dn->add_replica(from);
dout(10) << " have " << *dn << endl;
- ack->add_strong_dentry(*p, *q, dn->lock.get_state(), nonce);
+ if (ack)
+ ack->add_strong_dentry(*p, *q, dn->lock.get_state(), nonce);
} else {
dout(10) << " missing " << *p << " " << *q << endl;
if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING);
if (in) {
int nonce = in->add_replica(from);
in->mds_caps_wanted.erase(from);
- in->authlock.remove_gather(from); // just in case
- in->linklock.remove_gather(from); // just in case
- in->dirfragtreelock.remove_gather(from); // just in case
- in->filelock.remove_gather(from); // just in case
dout(10) << " have (weak) " << *in << endl;
- if (ack)
+ if (ack) {
+ in->authlock.remove_gather(from);
+ in->linklock.remove_gather(from);
+ in->dirfragtreelock.remove_gather(from);
+ in->filelock.remove_gather(from);
+ in->dirlock.remove_gather(from);
ack->add_strong_inode(in->ino(),
nonce,
0,
in->authlock.get_replica_state(),
in->linklock.get_replica_state(),
in->dirfragtreelock.get_replica_state(),
- in->filelock.get_replica_state());
+ in->filelock.get_replica_state(),
+ in->dirlock.get_replica_state());
+ }
} else {
dout(10) << " missing " << *p << endl;
if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING);
in->mds_caps_wanted[from] = p->second.caps_wanted;
else
in->mds_caps_wanted.erase(from);
- in->authlock.remove_gather(from); // just in case
- in->linklock.remove_gather(from); // just in case
- in->dirfragtreelock.remove_gather(from); // just in case
- in->filelock.remove_gather(from); // just in case
dout(10) << " have (strong) " << *in << endl;
if (ack) {
+ // i had inode, just tell replica the correct state
+ in->authlock.remove_gather(from);
+ in->linklock.remove_gather(from);
+ in->dirfragtreelock.remove_gather(from);
+ in->filelock.remove_gather(from);
+ in->dirlock.remove_gather(from);
ack->add_strong_inode(in->ino(),
nonce,
0,
in->authlock.get_replica_state(),
in->linklock.get_replica_state(),
in->dirfragtreelock.get_replica_state(),
- in->filelock.get_replica_state());
+ in->filelock.get_replica_state(),
+ in->dirlock.get_replica_state());
} else {
- // note strong replica filelock state requests
- //if (p->second.filelock & CAP_FILE_RD)
- //filelock_replica_readers.insert(in);
+ // take note of replica state values.
+ // SimpleLock --
+ // we can ignore; locked replicas can be safely changed to sync.
+ // FileLock --
+ // we can also ignore.
+ // replicas will at most issue RDCACHE|RD, which is covered by the default SYNC,
+ // so only _locally_ opened files are significant.
+ // ScatterLock -- adjust accordingly
+ if (p->second.dirlock == LOCK_SCATTER ||
+ p->second.dirlock == LOCK_GSCATTERS) // replica still has rdlocks
+ in->dirlock.set_state(LOCK_SCATTER);
}
} else {
dout(10) << " missing " << p->first << endl;
assert(dir);
dir->set_replica_nonce(p->second.nonce);
+ dir->state_clear(CDir::STATE_REJOINING);
dout(10) << " got " << *dir << endl;
// dentries
assert(dn);
dn->set_replica_nonce(q->second.nonce);
dn->lock.set_state(q->second.lock);
+ dn->state_clear(CDentry::STATE_REJOINING);
dout(10) << " got " << *dn << endl;
}
}
in->linklock.set_state(p->second.linklock);
in->dirfragtreelock.set_state(p->second.dirfragtreelock);
in->filelock.set_state(p->second.filelock);
+ in->dirlock.set_state(p->second.dirlock);
+ in->state_clear(CInode::STATE_REJOINING);
dout(10) << " got " << *in << endl;
}
p != m->weak_dirfrags.end();
++p) {
CDir *dir = get_dirfrag(*p);
- assert(dir);
+ if (!dir) {
+ dout(10) << " don't have dirfrag " << *p << endl;
+ continue; // we must have trimmed it after the original rejoin
+ }
+
dout(10) << " sending " << *dir << endl;
// dentries
q != m->weak_dentries[*p].end();
++q) {
CDentry *dn = dir->lookup(*q);
- assert(dn);
+ if (!dn) {
+ dout(10) << " don't have dentry " << *q << " in " << *dir << endl;
+ continue; // we must have trimmed it after our original rejoin
+ }
dout(10) << " sending " << *dn << endl;
if (mds->is_rejoin())
full->add_weak_dentry(*p, *q);
p != m->weak_inodes.end();
++p) {
CInode *in = get_inode(*p);
- assert(in);
+ if (!in) {
+ dout(10) << " don't have inode " << *p << endl;
+      continue; // we must have trimmed it after the original rejoin
+ }
dout(10) << " sending " << *in << endl;
full->add_full_inode(in->inode, in->symlink, in->dirfragtree);
in->authlock.get_replica_state(),
in->linklock.get_replica_state(),
in->dirfragtreelock.get_replica_state(),
- in->filelock.get_replica_state());
+ in->filelock.get_replica_state(),
+ in->dirlock.get_replica_state());
}
mds->send_message_mds(full, m->get_source().num(), MDS_PORT_CACHE);
void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *m)
{
+ dout(7) << "handle_cache_rejoin_full from " << m->get_source() << endl;
+
+
assert(0); // write me
+
+
+ delete m;
}
void MDCache::send_cache_rejoin_acks()
in->authlock.get_replica_state(),
in->linklock.get_replica_state(),
in->dirfragtreelock.get_replica_state(),
- in->filelock.get_replica_state());
+ in->filelock.get_replica_state(),
+ in->dirlock.get_replica_state());
}
// subdirs in this subtree?
// ===============================================================================
+/*
void MDCache::rename_file(CDentry *srcdn,
CDentry *destdn)
{
// link inode w/ dentry
destdn->dir->link_inode( destdn, in );
}
-
+*/
void MDCache::set_root(CInode *in)
p != dir->items.end();
++p) {
CDentry *dn = p->second;
- dout(7) << " dentry " << *dn << endl;
+ dout(7) << " dentry " << *dn << endl;
if (dn->is_primary() && dn->inode)
- dout(7) << " inode " << *dn->inode << endl;
+ dout(7) << " inode " << *dn->inode << endl;
}
}
}
void MDS::handle_mds_map(MMDSMap *m)
{
+ version_t hadepoch = mdsmap->get_epoch();
version_t epoch = m->get_epoch();
dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl;
}
*/
+ // just got mdsmap+osdmap?
+ if (hadepoch == 0 &&
+ mdsmap->get_epoch() > 0 &&
+ osdmap->get_epoch() > 0)
+ boot();
+
delete m;
}
void MDS::handle_osd_map(MOSDMap *m)
{
- version_t had = osdmap->get_epoch();
+ version_t hadepoch = osdmap->get_epoch();
+ dout(10) << "handle_osd_map had " << hadepoch << endl;
- dout(10) << "handle_osd_map had " << had << endl;
-
- // process locally
+ // process
objecter->handle_osd_map(m);
- if (had == 0 && osdmap->get_epoch() > 0) {
- if (is_creating())
- boot_create(); // new tables, journal
- else if (is_starting())
- boot_start(); // old tables, empty journal
- else if (is_replay())
- boot_replay(); // replay, join
- else
- assert(is_standby());
- }
-
+ // just got mdsmap+osdmap?
+ if (hadepoch == 0 &&
+ osdmap->get_epoch() > 0 &&
+ mdsmap->get_epoch() > 0)
+ boot();
+}
+
+
+void MDS::boot()
+{
+ if (is_creating())
+ boot_create(); // new tables, journal
+ else if (is_starting())
+ boot_start(); // old tables, empty journal
+ else if (is_replay())
+ boot_replay(); // replay, join
+ else
+ assert(0);
}
// hack: thrash exports
for (int i=0; i<g_conf.mds_thrash_exports; i++) {
set<int> s;
+ if (!is_active()) break;
mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
if (s.size() < 2 || mdcache->get_num_inodes() < 10)
break; // need peers for this to work.
int init(bool standby=false);
void reopen_logger();
+ void boot();
void boot_create(); // i am new mds.
void boot_start(); // i am old but empty (was down:out) mds.
void boot_replay(int step=0); // i am recovering existing (down:failed) mds.
void print(ostream& out) {
out << "(";
- //out << get_lock_type_name(l.get_type()) << " ";
+ out << get_lock_type_name(get_type()) << " ";
out << get_scatterlock_state_name(get_state());
if (!get_gather_set().empty()) out << " g=" << get_gather_set();
if (is_rdlocked())
}
*/
+
+/** predirty_dn_diri
+ * predirty the directory inode for a new dentry, if it is auth (and not root)
+ * BUG: root inode doesn't get dirtied properly, currently. blech.
+ */
+version_t Server::predirty_dn_diri(CDentry *dn, EMetaBlob *blob, utime_t mtime)
+{
+ version_t dirpv = 0;
+ CInode *diri = dn->dir->inode;
+
+ if (diri->is_auth() && !diri->is_root()) {
+ dirpv = diri->pre_dirty();
+ inode_t *pi = blob->add_primary_dentry(diri->get_parent_dn(), true);
+ pi->version = dirpv;
+ pi->ctime = pi->mtime = mtime;
+ dout(10) << "predirty_dn_diri ctime/mtime " << mtime << " pv " << dirpv << " on " << *diri << endl;
+ }
+
+ return dirpv;
+}
+
+/** dirty_dn_diri
+ * follow-up with actual dirty of inode after journal entry commits.
+ */
+void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime)
+{
+ CInode *diri = dn->dir->inode;
+
+  // make the update
+ diri->inode.ctime = diri->inode.mtime = mtime;
+
+ if (diri->is_auth() && !diri->is_root()) {
+ // we're auth.
+ diri->mark_dirty(dirpv);
+ dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl;
+ } else {
+    // we're not auth. the dirlock scatterlock will propagate the update.
+ }
+}
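
/* predirty_dn_diri/dirty_dn_diri are intended to be used as a pair around a
 * journaled update, roughly as in the mknod path below (the other handlers
 * follow the same pattern; request setup and locking are elided):
 *
 *   EUpdate *le = new EUpdate("mknod");
 *   version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // project dir mtime into the journal entry
 *   ...add the new dentry/inode to le->metablob...
 *   mdlog->submit_entry(le);
 *   mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
 *
 * and then, in the finisher, once the entry has hit the journal:
 *
 *   mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); // apply mtime and mark_dirty(dirpv)
 */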
+
+
+
+
+
+
+
+
+
// ===============================================================================
// STAT
CDentry *dn;
CInode *newi;
version_t pv;
+ version_t dirpv;
public:
- C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) :
+ C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) :
mds(m), mdr(r), dn(d), newi(ni),
- pv(d->get_projected_version()) {}
+ pv(d->get_projected_version()), dirpv(dirpv_) {}
void finish(int r) {
assert(r == 0);
newi->mark_dirty(pv);
// dir inode's mtime
- dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime,
- newi->inode.ctime);
-
+ mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime);
+
// hit pop
mds->balancer->hit_inode(newi, META_POP_IWR);
}
};
+
+
void Server::handle_client_mknod(MDRequest *mdr)
{
MClientRequest *req = mdr->client_request();
newi->inode.mode |= INODE_MODE_FILE;
// prepare finisher
- C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi);
EUpdate *le = new EUpdate("mknod");
le->metablob.add_client_req(req->get_reqid());
+
+ version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too
+
le->metablob.add_dir_context(dn->dir);
inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi);
pi->version = dn->get_projected_version();
// log + wait
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv);
mdlog->submit_entry(le);
mdlog->wait_for_sync(fin);
}
newdir->mark_dirty(newdir->pre_dirty());
// prepare finisher
- C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi);
EUpdate *le = new EUpdate("mkdir");
le->metablob.add_client_req(req->get_reqid());
+ version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too
le->metablob.add_dir_context(dn->dir);
inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi);
pi->version = dn->get_projected_version();
le->metablob.add_dir(newdir, true);
// log + wait
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv);
mdlog->submit_entry(le);
mdlog->wait_for_sync(fin);
newi->symlink = req->get_sarg();
// prepare finisher
- C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi);
EUpdate *le = new EUpdate("symlink");
le->metablob.add_client_req(req->get_reqid());
+ version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too
le->metablob.add_dir_context(dn->dir);
inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi);
pi->version = dn->get_projected_version();
// log + wait
+ C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv);
mdlog->submit_entry(le);
mdlog->wait_for_sync(fin);
}
version_t dpv;
utime_t tctime;
version_t tpv;
+ version_t dirpv;
public:
- C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, utime_t ct) :
+ C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_, utime_t ct) :
mds(m), mdr(r), dn(d), targeti(ti),
dpv(d->get_projected_version()),
tctime(ct),
- tpv(targeti->get_parent_dn()->get_projected_version()) {}
+ tpv(targeti->get_parent_dn()->get_projected_version()),
+ dirpv(dirpv_) { }
void finish(int r) {
assert(r == 0);
- mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv);
+ mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv, dirpv);
}
};
version_t tpdv = targeti->pre_dirty();
// add to event
+ utime_t now = g_clock.real_now();
+ version_t dirpv = predirty_dn_diri(dn, &le->metablob, now); // dir inode's mtime
le->metablob.add_dir_context(dn->get_dir());
le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote
le->metablob.add_dir_context(targeti->get_parent_dir());
// update journaled target inode
pi->nlink++;
- pi->ctime = g_clock.real_now();
+ pi->ctime = now;
pi->version = tpdv;
// finisher
- C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, pi->ctime);
+ C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, dirpv, now);
// log + wait
mdlog->submit_entry(le);
}
void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
- version_t dpv, utime_t tctime, version_t tpv)
+ version_t dpv, utime_t tctime, version_t tpv, version_t dirpv)
{
dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl;
targeti->mark_dirty(tpv);
// dir inode's mtime
- dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime,
- tctime);
+ dirty_dn_diri(dn, dirpv, tctime);
// bump target popularity
mds->balancer->hit_inode(targeti, META_POP_IWR);
CDentry *straydn;
version_t ipv; // referred inode
utime_t ictime;
- version_t dpv; // deleted dentry
+ version_t dnpv; // deleted dentry
+ version_t dirpv;
public:
C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd,
- version_t v, utime_t ct) :
+ version_t v, version_t dirpv_, utime_t ct) :
mds(m), mdr(r), dn(d), straydn(sd),
ipv(v), ictime(ct),
- dpv(d->get_projected_version()) { }
+ dnpv(d->get_projected_version()), dirpv(dirpv_) { }
void finish(int r) {
assert(r == 0);
- mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dpv);
+ mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dnpv, dirpv);
}
};
}
// the unlinked dentry
+ utime_t now = g_clock.real_now();
dn->pre_dirty();
+ version_t dirpv = predirty_dn_diri(dn, &le->metablob, now);
le->metablob.add_dir_context(dn->get_dir());
le->metablob.add_null_dentry(dn, true);
// update journaled target inode
pi->nlink--;
- pi->ctime = g_clock.real_now();
+ pi->ctime = now;
pi->version = ipv;
// finisher
C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
- ipv, pi->ctime);
+ ipv, dirpv, now);
journal_opens(); // journal pending opens, just in case
void Server::_unlink_local_finish(MDRequest *mdr,
CDentry *dn, CDentry *straydn,
- version_t ipv, utime_t ictime, version_t dpv)
+ version_t ipv, utime_t ictime, version_t dnpv, version_t dirpv)
{
dout(10) << "_unlink_local " << *dn << endl;
in->inode.ctime = ictime;
in->inode.nlink--;
in->mark_dirty(ipv); // dirty inode
- dn->mark_dirty(dpv); // dirty old dentry
+ dn->mark_dirty(dnpv); // dirty old dentry
// dir inode's mtime
- dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime,
- ictime);
+ dirty_dn_diri(dn, dirpv, ictime);
// share unlink news with replicas
for (map<int,int>::iterator it = dn->replicas_begin();
version_t straypv;
version_t destpv;
version_t srcpv;
+ version_t ddirpv, sdirpv;
utime_t ictime;
public:
version_t atid1;
version_t atid2;
C_MDS_rename_local_finish(MDS *m, MDRequest *r,
CDentry *sdn, CDentry *ddn, CDentry *stdn,
- version_t v, utime_t ct) :
+ version_t v, version_t ddirpv_, version_t sdirpv_, utime_t ct) :
mds(m), mdr(r),
srcdn(sdn), destdn(ddn), straydn(stdn),
ipv(v),
straypv(straydn ? straydn->get_projected_version():0),
destpv(destdn->get_projected_version()),
srcpv(srcdn->get_projected_version()),
+ ddirpv(ddirpv_), sdirpv(sdirpv_),
ictime(ct),
atid1(0), atid2(0) { }
void finish(int r) {
assert(r == 0);
mds->server->_rename_local_finish(mdr, srcdn, destdn, straydn,
- srcpv, destpv, straypv, ipv, ictime,
+ srcpv, destpv, straypv, ipv, ddirpv, sdirpv, ictime,
atid1, atid2);
}
};
EUpdate *le = new EUpdate("rename_local");
le->metablob.add_client_req(mdr->reqid);
+ utime_t now = g_clock.real_now();
+
CDentry *straydn = 0;
inode_t *pi = 0;
version_t ipv = 0;
// primary+remote link merge?
bool linkmerge = (srcdn->inode == destdn->inode &&
(srcdn->is_primary() || destdn->is_primary()));
+
+ // dir mtimes
+ version_t ddirpv = predirty_dn_diri(destdn, &le->metablob, now);
+ version_t sdirpv = predirty_dn_diri(srcdn, &le->metablob, now);
+
if (linkmerge) {
dout(10) << "will merge remote+primary links" << endl;
-
+
// destdn -> primary
le->metablob.add_dir_context(destdn->dir);
ipv = destdn->pre_dirty(destdn->inode->inode.version);
if (pi) {
// update journaled target inode
pi->nlink--;
- pi->ctime = g_clock.real_now();
+ pi->ctime = now;
pi->version = ipv;
}
C_MDS_rename_local_finish *fin = new C_MDS_rename_local_finish(mds, mdr,
srcdn, destdn, straydn,
- ipv, pi ? pi->ctime:utime_t());
+ ipv, ddirpv, sdirpv, now);
journal_opens(); // journal pending opens, just in case
void Server::_rename_local_finish(MDRequest *mdr,
CDentry *srcdn, CDentry *destdn, CDentry *straydn,
version_t srcpv, version_t destpv, version_t straypv, version_t ipv,
+ version_t ddirpv, version_t sdirpv,
utime_t ictime,
version_t atid1, version_t atid2)
{
bool linkmerge = (srcdn->inode == destdn->inode &&
(srcdn->is_primary() || destdn->is_primary()));
+ // dir mtimes
+ dirty_dn_diri(destdn, ddirpv, ictime);
+ dirty_dn_diri(srcdn, sdirpv, ictime);
+
if (linkmerge) {
assert(ipv);
+
if (destdn->is_primary()) {
dout(10) << "merging remote onto primary link" << endl;
<< " on " << *cur << endl;
// hit pop
- mds->balancer->hit_inode(cur, META_POP_IRD);
+ if (cmode == FILE_MODE_RW ||
+ cmode == FILE_MODE_W)
+ mds->balancer->hit_inode(cur, META_POP_IWR);
+ else
+ mds->balancer->hit_inode(cur, META_POP_IRD);
// reply
MClientReply *reply = new MClientReply(req, 0);
CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr);
//CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr);
+ version_t predirty_dn_diri(CDentry *dn, class EMetaBlob *blob, utime_t mtime);
+ void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime);
+
// requests on existing inodes.
void handle_client_stat(MDRequest *mdr);
void handle_client_utime(MDRequest *mdr);
void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti);
void _link_local_finish(MDRequest *mdr,
CDentry *dn, CInode *targeti,
- version_t, utime_t, version_t);
+ version_t, utime_t, version_t, version_t);
void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti);
// unlink
void _unlink_local(MDRequest *mdr, CDentry *dn);
void _unlink_local_finish(MDRequest *mdr,
CDentry *dn, CDentry *straydn,
- version_t, utime_t, version_t);
+ version_t, utime_t, version_t, version_t);
void _unlink_remote(MDRequest *mdr, CDentry *dn);
// rename
void _rename_local_finish(MDRequest *mdr,
CDentry *srcdn, CDentry *destdn, CDentry *straydn,
version_t srcpv, version_t destpv, version_t straypv, version_t ipv,
- utime_t ictime,
+ version_t ddirpv, version_t sdirpv, utime_t ictime,
version_t atid1, version_t atid2);
};
inline const char *get_lock_type_name(int t) {
switch (t) {
- case LOCK_OTYPE_DN: return "dentry";
- case LOCK_OTYPE_IFILE: return "inode_file";
- case LOCK_OTYPE_IAUTH: return "inode_auth";
- case LOCK_OTYPE_ILINK: return "inode_link";
- case LOCK_OTYPE_IDIRFRAGTREE: return "inode_dirfragtree";
- case LOCK_OTYPE_IDIR: return "inode_dir";
+ case LOCK_OTYPE_DN: return "dn";
+ case LOCK_OTYPE_IFILE: return "ifile";
+ case LOCK_OTYPE_IAUTH: return "iauth";
+ case LOCK_OTYPE_ILINK: return "ilink";
+ case LOCK_OTYPE_IDIRFRAGTREE: return "idft";
+ case LOCK_OTYPE_IDIR: return "idir";
default: assert(0);
}
}
// -- lock states --
#define LOCK_UNDEF 0
-// auth rep
+// auth rep
#define LOCK_SYNC 1 // AR R . R .
#define LOCK_LOCK 2 // AR R W . .
#define LOCK_GLOCKR -3 // AR R . . .
+#define LOCK_REMOTEXLOCK -50 // on NON-auth
inline const char *get_simplelock_state_name(int n) {
switch (n) {
- case LOCK_UNDEF: return "undef";
+ case LOCK_UNDEF: return "UNDEF";
case LOCK_SYNC: return "sync";
case LOCK_LOCK: return "lock";
case LOCK_GLOCKR: return "glockr";
+ case LOCK_REMOTEXLOCK: return "remote_xlock";
default: assert(0);
}
}
public:
static const int WAIT_RD = (1<<0); // to read
static const int WAIT_WR = (1<<1); // to write
- static const int WAIT_NOLOCKS = (1<<2); // for last rdlock to finish
- //static const int WAIT_LOCK = (1<<3); // for locked state
+ static const int WAIT_SINGLEAUTH = (1<<2);
static const int WAIT_STABLE = (1<<3); // for a stable state
static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock
static const int WAIT_BITS = 5;
virtual void print(ostream& out) {
out << "(";
- //out << get_lock_type_name(l.get_type()) << " ";
+ out << get_lock_type_name(get_type()) << " ";
out << get_simplelock_state_name(get_state());
if (!get_gather_set().empty()) out << " g=" << get_gather_set();
if (is_rdlocked())
set<dirfrag_t> &get_bounds() { return bounds; }
void print(ostream& out) {
- out << "export " << base << " " << metablob;
+ out << "EExport " << base << " " << metablob;
}
virtual void encode_payload(bufferlist& bl) {
EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { }
void print(ostream& out) {
- out << "import_finish " << base;
+ out << "EImportFinish " << base;
if (success)
out << " success";
else
}
// -- state --
- const static int STATE_AUTH = (1<<30);
- const static int STATE_DIRTY = (1<<29);
+ const static int STATE_AUTH = (1<<30);
+ const static int STATE_DIRTY = (1<<29);
+ const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy
// -- wait --
const static int WAIT_SINGLEAUTH = (1<<30);
void state_reset(unsigned s) { state = s; }
bool is_auth() { return state_test(STATE_AUTH); }
- bool is_dirty() { return state & STATE_DIRTY; }
+ bool is_dirty() { return state_test(STATE_DIRTY); }
bool is_clean() { return !is_dirty(); }
+ bool is_rejoining() { return state_test(STATE_REJOINING); }
// --------------------------------------------
// authority
if (waiting.empty())
get(PIN_WAITER);
waiting.insert(pair<int,Context*>(mask, c));
- dout(10) << (mdsco_db_line_prefix(this))
+ pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this))
<< "add_waiter " << mask << " " << c
<< " on " << *this
<< endl;
while (it != waiting.end()) {
if (it->first & mask) {
ls.push_back(it->second);
- dout(10) << (mdsco_db_line_prefix(this))
+ pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this))
<< "take_waiting mask " << mask << " took " << it->second
<< " tag " << it->first
<< " on " << *this
<< endl;
waiting.erase(it++);
} else {
- dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second
+ pdout(10,g_conf.debug_mds) << "take_waiting mask " << mask << " SKIPPING " << it->second
<< " tag " << it->first
<< " on " << *this
<< endl;
virtual char *get_type_name() { return "creply"; }
void print(ostream& o) {
o << "creply(" << env.dst.name << "." << st.tid;
- if (st.result) o << " = " << st.result;
+ o << " = " << st.result;
+ if (st.result <= 0)
+ o << " " << strerror(-st.result);
o << ")";
}
#define __MLOCK_H
#include "msg/Message.h"
-
+#include "mds/SimpleLock.h"
// for replicas
#define LOCK_AC_SYNC -1
data.claim(bl);
}
virtual char *get_type_name() { return "ILock"; }
+ void print(ostream& out) {
+ out << "lock(a=" << action
+ << " " << ino
+ << " " << get_lock_type_name(otype)
+ << ")";
+ }
void set_ino(inodeno_t ino, char ot) {
otype = ot;
this->dn = dn;
}
void set_reqid(metareqid_t ri) { reqid = ri; }
- void set_data(bufferlist& data) {
- this->data.claim( data );
+ void set_data(const bufferlist& data) {
+ this->data = data;
}
void decode_payload() {
int off = 0;
- payload.copy(off,sizeof(action), (char*)&action);
- off += sizeof(action);
- payload.copy(off,sizeof(asker), (char*)&asker);
- off += sizeof(asker);
- payload.copy(off,sizeof(otype), (char*)&otype);
- off += sizeof(otype);
- payload.copy(off,sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- payload.copy(off,sizeof(dirfrag), (char*)&dirfrag);
- off += sizeof(dirfrag);
+ ::_decode(action, payload, off);
+ ::_decode(asker, payload, off);
+ ::_decode(otype, payload, off);
+ ::_decode(ino, payload, off);
+ ::_decode(dirfrag, payload, off);
::_decode(reqid, payload, off);
::_decode(dn, payload, off);
::_decode(data, payload, off);
}
virtual void encode_payload() {
- payload.append((char*)&action, sizeof(action));
- payload.append((char*)&asker, sizeof(asker));
- payload.append((char*)&otype, sizeof(otype));
- payload.append((char*)&ino, sizeof(ino));
- payload.append((char*)&dirfrag, sizeof(dirfrag));
+ ::_encode(action, payload);
+ ::_encode(asker, payload);
+ ::_encode(otype, payload);
+ ::_encode(ino, payload);
+ ::_encode(dirfrag, payload);
::_encode(reqid, payload);
::_encode(dn, payload);
::_encode(data, payload);
int32_t linklock;
int32_t dirfragtreelock;
int32_t filelock;
+ __int32_t dirlock;
inode_strong() {}
- inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0) :
+ inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) :
caps_wanted(cw),
nonce(n),
- authlock(a), linklock(l), dirfragtreelock(dft), filelock(f) { }
+ authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { }
};
struct inode_full {
inode_t inode;
void add_weak_inode(inodeno_t ino) {
weak_inodes.insert(ino);
}
- void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f) {
- strong_inodes[i] = inode_strong(n, cw, a, l, dft, f);
+ void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) {
+ strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl);
}
void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) {
full_inodes.push_back(inode_full(i, s, f));
bufferlist bl;
bl.append((char*)&last_written, sizeof(last_written));
filer.write(inode, 0, bl.length(), bl, 0,
- 0, new C_WriteHead(this, last_written, oncommit));
+ 0,
+ new C_WriteHead(this, last_written, oncommit));
}
void Journaler::_finish_write_head(Header &wrote, Context *oncommit)
dout(10) << "flush flushing " << flush_pos << "~" << len << endl;
// submit write for anything pending
+ // flush _start_ pos to _finish_flush
filer.write(inode, flush_pos, len, write_buf, 0,
- new C_Flush(this, flush_pos), 0); // flush _start_ pos to _finish_flush
+ g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK
+ g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT
pending_flush[flush_pos] = g_clock.now();
// adjust pointers