From af8a786079c8d5db2ecffd6c59478393685fcf71 Mon Sep 17 00:00:00 2001 From: sageweil Date: Thu, 7 Jun 2007 00:27:45 +0000 Subject: [PATCH] merged trunk changes r1397:1408 into branches/sage/pgs git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1409 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/pgs/Makefile | 2 +- branches/sage/pgs/TODO | 77 ++--- branches/sage/pgs/client/Client.cc | 4 +- branches/sage/pgs/config.cc | 12 +- branches/sage/pgs/config.h | 8 +- branches/sage/pgs/mds/CDir.cc | 9 + branches/sage/pgs/mds/CInode.cc | 22 +- branches/sage/pgs/mds/FileLock.h | 2 +- branches/sage/pgs/mds/Locker.cc | 323 ++++++++++++------- branches/sage/pgs/mds/Locker.h | 2 +- branches/sage/pgs/mds/MDBalancer.cc | 28 +- branches/sage/pgs/mds/MDCache.cc | 167 +++++++--- branches/sage/pgs/mds/MDS.cc | 44 ++- branches/sage/pgs/mds/MDS.h | 1 + branches/sage/pgs/mds/ScatterLock.h | 2 +- branches/sage/pgs/mds/Server.cc | 143 ++++++-- branches/sage/pgs/mds/Server.h | 9 +- branches/sage/pgs/mds/SimpleLock.h | 23 +- branches/sage/pgs/mds/events/EExport.h | 2 +- branches/sage/pgs/mds/events/EImportFinish.h | 2 +- branches/sage/pgs/mds/mdstypes.h | 14 +- branches/sage/pgs/messages/MClientReply.h | 4 +- branches/sage/pgs/messages/MLock.h | 37 +-- branches/sage/pgs/messages/MMDSCacheRejoin.h | 9 +- branches/sage/pgs/messages/MOSDMap.h | 3 + branches/sage/pgs/mon/OSDMonitor.cc | 20 +- branches/sage/pgs/mon/OSDMonitor.h | 4 +- branches/sage/pgs/osdc/Journaler.cc | 7 +- 28 files changed, 651 insertions(+), 329 deletions(-) diff --git a/branches/sage/pgs/Makefile b/branches/sage/pgs/Makefile index e85e18b56670f..2321d7068063d 100644 --- a/branches/sage/pgs/Makefile +++ b/branches/sage/pgs/Makefile @@ -12,7 +12,7 @@ CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWI LDINC = ar -rc else # For linux -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE LDINC = ld -i -o endif diff --git a/branches/sage/pgs/TODO b/branches/sage/pgs/TODO index 2925ba907ebd2..e288f15e09c6c 100644 --- a/branches/sage/pgs/TODO +++ b/branches/sage/pgs/TODO @@ -46,22 +46,52 @@ sage doc sage mds +- journal+recovery + - local rename + - how to notify replicas... +/ - stray purge + - stray reintegration + - remote link + - impl remote inode xlock + - ESlaveUpdate replay, resolution, etc. + - remote unlink + - remote rename + - file capabilities i/o +- dirfrag split/merge + - client readdir for dirfrags +- consistency points/snapshots + - dentry versions vs dirfrags... +- statfs? + - finish multistage rejoin +- trim_on_rejoin - more testing of failures + thrashing. - is export prep dir open deadlock properly fixed by forge_replica_dir()? - -- locker vs node failure -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! -- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) -- incremental mdsmaps? -- client failure - dirfrag split - make sure we are freezing _before_ we fetch to complete the dirfrag, else we break commit()'s preconditions when it fetches an incomplete dir. +- detect and deal with client failure + +- recovering open files + - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state. + - path+cap window will require some fetching of metadata from disk before doing the rejoin + - failures during migration.. what about client stale/reap stuff and misplaced WR caps? + +- inode.max_size + +- real chdir (directory "open") + - relative metadata ops + + + +- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. + +- incremental mdsmaps? + - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) - dir version/committed/etc versus migration, log expires. - DOCUMENT. @@ -77,45 +107,14 @@ sage mds - test open_remote_ino -- scatterlock - - unlink, link, rename need to pre_dirty and update dir inode's mtime - - tho need to make sure that behaves when dirfrag's inode is non-auth... - - FIXME how to journal root and stray inode content? - in particular, i care about dirfragtree.. get it on rejoin? - and dir sizes, if i add that... also on rejoin? -- recovering open files - - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state. - - path+cap window will require some fetching of metadata from disk before doing the rejoin - - failures during migration.. what about client stale/reap stuff and misplaced WR caps? -- inode.max_size -- journal+recovery - - local rename - - how to notify replicas... -/ - stray purge - - stray reintegration - - remote link - - impl remote inode xlock - - ESlaveUpdate replay, resolution, etc. - - remote unlink - - rewrite to look link _link - - remote rename - - file capabilities i/o -- filelock to control directory mtime, dentry changes - - hmm, may have to change lock ordering, and Server::rdlock_path_pin_ref() -- dirfrag split/merge - - client readdir for dirfrags -- consistency points/snapshots - - dentry versions vs dirfrags... -- real chdir (directory "open") - - relative metadata ops -- statfs? -- fix lock caps gather ack versus ambiguous auth foreign rename @@ -215,9 +214,13 @@ rados snapshots objecter +- transaction prepare/commit - read+floor_lockout osd/rados +- transaction prepare/commit + - rollback + - rollback logging (to fix slow prepare vs rollback race) - read+floor_lockout for clean STOGITH-like/fencing semantics after failover. - efficiently replicate clone() objects - flag missing log entries on crash recovery --> WRNOOP? or WRLOST? diff --git a/branches/sage/pgs/client/Client.cc b/branches/sage/pgs/client/Client.cc index f97aaff578833..90e37a58a6ac2 100644 --- a/branches/sage/pgs/client/Client.cc +++ b/branches/sage/pgs/client/Client.cc @@ -1264,7 +1264,9 @@ int Client::mount() dout(2) << "sending client_mount to mon" << mon << endl; messenger->send_message(new MClientMount, monmap->get_inst(mon)); - while (!mdsmap) + while (!mdsmap || + !osdmap || + osdmap->get_epoch() == 0) mount_cond.Wait(client_lock); mounted = true; diff --git a/branches/sage/pgs/config.cc b/branches/sage/pgs/config.cc index fa06c4d66ccd6..d6a21af1d03cd 100644 --- a/branches/sage/pgs/config.cc +++ b/branches/sage/pgs/config.cc @@ -152,6 +152,7 @@ md_config_t g_conf = { // --- journaler --- journaler_allow_split_entries: true, + journaler_safe: false, // wait for COMMIT on journal writes // --- mds --- mds_cache_size: MDS_CACHE_SIZE, @@ -160,14 +161,13 @@ md_config_t g_conf = { mds_decay_halflife: 30, mds_beacon_interval: 5.0, - mds_beacon_grace: 100.0, + mds_beacon_grace: 10.0, mds_log: true, mds_log_max_len: MDS_CACHE_SIZE / 3, mds_log_max_trimming: 10000, mds_log_read_inc: 1<<20, mds_log_pad_entry: 128,//256,//64, - mds_log_before_reply: true, mds_log_flush_on_shutdown: true, mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log mds_log_eopen_size: 100, // # open inodes per log entry @@ -191,6 +191,7 @@ md_config_t g_conf = { mds_bal_midchunk: .3, // any sub bigger than this taken in full mds_bal_minchunk: .001, // never take anything smaller than this + mds_trim_on_rejoin: true, mds_commit_on_shutdown: true, mds_shutdown_check: 0, //30, mds_shutdown_on_last_unmount: true, @@ -243,7 +244,7 @@ md_config_t g_conf = { ebofs_cloneable: false, ebofs_verify: false, ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 100, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms + ebofs_idle_commit_ms: 20, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms ebofs_oc_size: 10000, // onode cache ebofs_cc_size: 10000, // cnode cache ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB @@ -575,6 +576,9 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--objecter_buffer_uncommitted") == 0) g_conf.objecter_buffer_uncommitted = atoi(args[++i]); + else if (strcmp(args[i], "--journaler_safe") == 0) + g_conf.journaler_safe = atoi(args[++i]); + else if (strcmp(args[i], "--mds_cache_size") == 0) g_conf.mds_cache_size = atoi(args[++i]); @@ -585,8 +589,6 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mds_log") == 0) g_conf.mds_log = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_before_reply") == 0) - g_conf.mds_log_before_reply = atoi(args[++i]); else if (strcmp(args[i], "--mds_log_max_len") == 0) g_conf.mds_log_max_len = atoi(args[++i]); else if (strcmp(args[i], "--mds_log_read_inc") == 0) diff --git a/branches/sage/pgs/config.h b/branches/sage/pgs/config.h index 31d6d3a0f5ae4..7110a5ad6ea86 100644 --- a/branches/sage/pgs/config.h +++ b/branches/sage/pgs/config.h @@ -154,7 +154,8 @@ struct md_config_t { // journaler bool journaler_allow_split_entries; - + bool journaler_safe; + // mds int mds_cache_size; float mds_cache_mid; @@ -169,7 +170,6 @@ struct md_config_t { int mds_log_max_trimming; int mds_log_read_inc; int mds_log_pad_entry; - bool mds_log_before_reply; bool mds_log_flush_on_shutdown; off_t mds_log_import_map_interval; int mds_log_eopen_size; @@ -193,9 +193,11 @@ struct md_config_t { float mds_bal_midchunk; float mds_bal_minchunk; + bool mds_trim_on_rejoin; bool mds_commit_on_shutdown; int mds_shutdown_check; bool mds_shutdown_on_last_unmount; + bool mds_verify_export_dirauth; // debug flag bool mds_local_osd; @@ -334,6 +336,8 @@ extern md_config_t g_debug_after_conf; #define dout(x) if ((x) <= g_conf.debug) std::cout #define dout2(x) if ((x) <= g_conf.debug) std::cout +#define pdout(x,p) if ((x) <= (p)) std::cout + /** * for cleaner output, bracket each line with * dbeginl (in the dout macro) and dendl (in place of endl). diff --git a/branches/sage/pgs/mds/CDir.cc b/branches/sage/pgs/mds/CDir.cc index 4ef7d36c4cb97..41d6f6b81749b 100644 --- a/branches/sage/pgs/mds/CDir.cc +++ b/branches/sage/pgs/mds/CDir.cc @@ -1191,6 +1191,14 @@ void CDir::freeze_tree(Context *c) void CDir::freeze_tree_finish(Context *c) { + // still freezing? (we may have been canceled) + if (!is_freezing()) { + dout(10) << "freeze_tree_finish no longer freezing, done on " << *this << endl; + c->finish(-1); + delete c; + return; + } + // freezeable now? if (!is_freezeable()) { // wait again! @@ -1246,6 +1254,7 @@ void CDir::unfreeze_tree() state_clear(STATE_FREEZINGTREE); // cancel freeze waiters + finish_waiting(WAIT_UNFREEZE); finish_waiting(WAIT_FREEZEABLE, -1); } } diff --git a/branches/sage/pgs/mds/CInode.cc b/branches/sage/pgs/mds/CInode.cc index 97150a53d9166..0b614f7b1a3cc 100644 --- a/branches/sage/pgs/mds/CInode.cc +++ b/branches/sage/pgs/mds/CInode.cc @@ -61,17 +61,13 @@ ostream& operator<<(ostream& out, CInode& in) out << " v" << in.get_version(); - out << " auth=" << in.authlock; - out << " link=" << in.linklock; - out << " dft=" << in.dirfragtreelock; - out << " file=" << in.filelock; - out << " dir=" << in.dirlock; + // locks + out << " " << in.authlock; + out << " " << in.linklock; + out << " " << in.dirfragtreelock; + out << " " << in.filelock; + out << " " << in.dirlock; - if (in.get_num_ref()) { - out << " |"; - in.print_pin_set(out); - } - // hack: spit out crap on which clients have caps if (!in.get_client_caps().empty()) { out << " caps={"; @@ -83,6 +79,12 @@ ostream& operator<<(ostream& out, CInode& in) } out << "}"; } + + if (in.get_num_ref()) { + out << " |"; + in.print_pin_set(out); + } + out << " " << ∈ out << "]"; return out; diff --git a/branches/sage/pgs/mds/FileLock.h b/branches/sage/pgs/mds/FileLock.h index e0acc3df279b4..6c22631d5dac3 100644 --- a/branches/sage/pgs/mds/FileLock.h +++ b/branches/sage/pgs/mds/FileLock.h @@ -210,7 +210,7 @@ class FileLock : public SimpleLock { void print(ostream& out) { out << "("; - //out << get_lock_type_name(l.get_type()) << " "; + out << get_lock_type_name(get_type()) << " "; out << get_filelock_state_name(get_state()); if (!get_gather_set().empty()) out << " g=" << get_gather_set(); if (is_rdlocked()) diff --git a/branches/sage/pgs/mds/Locker.cc b/branches/sage/pgs/mds/Locker.cc index 741d89d43e28f..836b76ca96698 100644 --- a/branches/sage/pgs/mds/Locker.cc +++ b/branches/sage/pgs/mds/Locker.cc @@ -60,6 +60,7 @@ void Locker::dispatch(Message *m) { + switch (m->get_type()) { // locking @@ -89,16 +90,20 @@ void Locker::send_lock_message(SimpleLock *lock, int msg) for (map::iterator it = lock->get_parent()->replicas_begin(); it != lock->get_parent()->replicas_end(); it++) { + if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) + continue; MLock *m = new MLock(lock, msg, mds->get_nodeid()); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); } } -void Locker::send_lock_message(SimpleLock *lock, int msg, bufferlist &data) +void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) { for (map::iterator it = lock->get_parent()->replicas_begin(); it != lock->get_parent()->replicas_end(); it++) { + if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) + continue; MLock *m = new MLock(lock, msg, mds->get_nodeid()); m->set_data(data); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); @@ -499,9 +504,11 @@ void Locker::request_inode_file_caps(CInode *in) assert(!in->is_auth()); in->replica_caps_wanted = wanted; - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); + + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) + mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), + in->replica_caps_wanted), + auth, MDS_PORT_LOCKER); } else { in->replica_caps_wanted_keep_until.sec_ref() = 0; } @@ -509,18 +516,23 @@ void Locker::request_inode_file_caps(CInode *in) void Locker::handle_inode_file_caps(MInodeFileCaps *m) { + // nobody should be talking to us during recovery. + assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); + + // ok CInode *in = mdcache->get_inode(m->get_ino()); assert(in); - assert(in->is_auth());// || in->is_proxy()); - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; + assert(in->is_auth()); - /*if (in->is_proxy()) { - dout(7) << "proxy, fw" << endl; - mds->send_message_mds(m, in->authority().first, MDS_PORT_LOCKER); + if (mds->is_rejoin() && + in->is_rejoining()) { + dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << endl; + delete m; return; } - */ + + + dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; if (m->get_caps()) in->mds_caps_wanted[m->get_from()] = m->get_caps(); @@ -703,6 +715,9 @@ ALSO: void Locker::handle_lock(MLock *m) { + // nobody should be talking to us during recovery. + assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); + switch (m->get_otype()) { case LOCK_OTYPE_DN: { @@ -778,6 +793,15 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) { int from = m->get_asker(); + if (mds->is_rejoin()) { + if (lock->get_parent()->is_rejoining()) { + dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent() + << ", dropping " << *m << endl; + delete m; + return; + } + } + switch (m->get_action()) { // -- replica -- case LOCK_AC_SYNC: @@ -796,15 +820,12 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_GLOCKR); - lock->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m)); - return; + } else { + // update lock and reply + lock->set_state(LOCK_LOCK); + mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), + from, MDS_PORT_LOCKER); } - - // update lock and reply - lock->set_state(LOCK_LOCK); - - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - from, MDS_PORT_LOCKER); break; @@ -815,6 +836,7 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) MDRequest *mdr = mdcache->request_get(m->get_reqid()); mdr->xlocks.insert(lock); mdr->locks.insert(lock); + lock->set_state(LOCK_REMOTEXLOCK); lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); } break; @@ -879,38 +901,66 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) } +class C_Locker_SimpleEval : public Context { + Locker *locker; + SimpleLock *lock; +public: + C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} + void finish(int r) { + locker->simple_eval(lock); + } +}; + void Locker::simple_eval(SimpleLock *lock) { - // finished gather? - if (lock->get_parent()->is_auth() && - !lock->is_stable() && - !lock->is_gathering()) { + // unstable and ambiguous auth? + if (!lock->is_stable() && + lock->get_parent()->is_ambiguous_auth()) { + dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); + return; + } + + // finished remote xlock? + if (lock->get_state() == LOCK_REMOTEXLOCK && + !lock->is_xlocked()) { + // tell auth + assert(!lock->get_parent()->is_auth()); // should be auth_pinned on the auth + dout(7) << "simple_eval releasing remote xlock on " << *lock->get_parent() << endl; + int auth = lock->get_parent()->authority().first; + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) + mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()), + auth, MDS_PORT_LOCKER); + lock->set_state(LOCK_LOCK); + } + + // finished gathering? + if (lock->get_state() == LOCK_GLOCKR && + !lock->is_gathering() && + !lock->is_rdlocked()) { dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << endl; - switch (lock->get_state()) { - case LOCK_GLOCKR: - lock->set_state(LOCK_LOCK); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - break; - - default: - assert(0); + + // replica: tell auth + if (!lock->get_parent()->is_auth()) { + int auth = lock->get_parent()->authority().first; + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) + mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), + lock->get_parent()->authority().first, MDS_PORT_LOCKER); } - } - if (!lock->is_stable()) return; - - if (lock->get_parent()->is_auth()) { - // sync? - if (lock->get_state() != LOCK_SYNC && - lock->get_parent()->is_replicated() && - !lock->is_waiter_for(SimpleLock::WAIT_WR)) { - dout(7) << "simple_eval stable, syncing " << *lock - << " on " << *lock->get_parent() << endl; - simple_sync(lock); - } + lock->set_state(LOCK_LOCK); + lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); + } - } else { - // replica + // stable -> sync? + if (lock->get_parent()->is_auth() && + lock->is_stable() && + lock->get_state() != LOCK_SYNC && + !lock->is_waiter_for(SimpleLock::WAIT_WR)) { + dout(7) << "simple_eval stable, syncing " << *lock + << " on " << *lock->get_parent() << endl; + simple_sync(lock); } } @@ -929,13 +979,16 @@ void Locker::simple_sync(SimpleLock *lock) if (lock->get_state() == LOCK_GLOCKR) assert(0); // um... hmm! assert(lock->get_state() == LOCK_LOCK); - - // hard data - bufferlist data; - lock->encode_locked_state(data); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC, data); + + // sync. + if (lock->get_parent()->is_replicated()) { + // hard data + bufferlist data; + lock->encode_locked_state(data); + + // bcast to replicas + send_lock_message(lock, LOCK_AC_SYNC, data); + } // change lock lock->set_state(LOCK_SYNC); @@ -1015,11 +1068,9 @@ void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (lock->get_state() == LOCK_GLOCKR && - !lock->is_rdlocked()) { - lock->set_state(LOCK_SYNC); // return state to sync, in case the unpinner flails - lock->finish_waiters(SimpleLock::WAIT_NOLOCKS); - } + // last one? + if (!lock->is_rdlocked()) + simple_eval(lock); } bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) @@ -1066,6 +1117,7 @@ bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) new C_MDS_RetryRequest(mdcache, mdr)); return false; } + int auth = lock->get_parent()->authority().first; // wait for sync. // (???????????) @@ -1075,7 +1127,6 @@ bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) } // send lock request - int auth = lock->get_parent()->authority().first; MLock *m = new MLock(lock, LOCK_AC_REQXLOCK, mds->get_nodeid()); mds->send_message_mds(m, auth, MDS_PORT_LOCKER); @@ -1091,29 +1142,16 @@ void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) // drop ref assert(lock->can_xlock(mdr)); lock->put_xlock(); + assert(mdr); mdr->xlocks.erase(lock); mdr->locks.erase(lock); dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - // slave? - if (!lock->get_parent()->is_auth()) { - mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); - } - // others waiting? - if (lock->is_waiter_for(SimpleLock::WAIT_WR)) { - // wake 'em up - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - } else { - // auto-sync if alone. - if (lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - lock->get_state() != LOCK_SYNC) - lock->set_state(LOCK_SYNC); - - simple_eval(lock); - } + lock->finish_waiters(SimpleLock::WAIT_WR, 0); + + // eval + simple_eval(lock); } @@ -1261,19 +1299,43 @@ void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) scatter_eval(lock); } + +class C_Locker_ScatterEval : public Context { + Locker *locker; + ScatterLock *lock; +public: + C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} + void finish(int r) { + locker->scatter_eval(lock); + } +}; + void Locker::scatter_eval(ScatterLock *lock) { + // unstable and ambiguous auth? + if (!lock->is_stable() && + lock->get_parent()->is_ambiguous_auth()) { + dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); + return; + } + if (!lock->get_parent()->is_auth()) { // REPLICA if (lock->get_state() == LOCK_GSYNCS && !lock->is_wrlocked()) { dout(10) << "scatter_eval no wrlocks, acking sync" << endl; - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); + int auth = lock->get_parent()->authority().first; + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { + bufferlist data; + lock->encode_locked_state(data); + mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), + auth, MDS_PORT_LOCKER); + } lock->set_state(LOCK_SYNC); + lock->finish_waiters(ScatterLock::WAIT_STABLE); // ? } } else { @@ -1286,7 +1348,7 @@ void Locker::scatter_eval(ScatterLock *lock) dout(7) << "scatter_eval finished gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD|SimpleLock::WAIT_NOLOCKS); + lock->finish_waiters(ScatterLock::WAIT_STABLE|ScatterLock::WAIT_RD); } // gscatters -> scatter? @@ -1301,13 +1363,15 @@ void Locker::scatter_eval(ScatterLock *lock) } lock->set_state(LOCK_SCATTER); - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + lock->get_wrlock(); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); + lock->put_wrlock(); } // waiting for rd? if (lock->get_state() == LOCK_SCATTER && !lock->is_wrlocked() && - lock->is_waiter_for(SimpleLock::WAIT_RD)) { + lock->is_waiter_for(ScatterLock::WAIT_RD)) { dout(10) << "scatter_eval no wrlocks, read waiter, syncing" << endl; scatter_sync(lock); } @@ -1339,9 +1403,10 @@ void Locker::scatter_sync(ScatterLock *lock) } else if (lock->is_wrlocked()) { lock->set_state(LOCK_GSYNCS); - } else { + } + else { lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); } } @@ -1365,7 +1430,7 @@ void Locker::scatter_scatter(ScatterLock *lock) send_lock_message(lock, LOCK_AC_SCATTER, data); } lock->set_state(LOCK_SCATTER); - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); } } @@ -1375,6 +1440,15 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) { int from = m->get_asker(); + if (mds->is_rejoin()) { + if (lock->get_parent()->is_rejoining()) { + dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() + << ", dropping " << *m << endl; + delete m; + return; + } + } + switch (m->get_action()) { // -- replica -- case LOCK_AC_SYNC: @@ -1398,7 +1472,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) assert(lock->get_state() == LOCK_SYNC); lock->decode_locked_state(m->get_data()); lock->set_state(LOCK_SCATTER); - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); break; // -- for auth -- @@ -1416,7 +1490,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() << " from " << from << ", last one" << endl; - simple_eval(lock); + scatter_eval(lock); } break; } @@ -1508,10 +1582,8 @@ void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (!lock->is_rdlocked()) { - lock->finish_waiters(SimpleLock::WAIT_NOLOCKS); + if (!lock->is_rdlocked()) file_eval(lock); - } } @@ -1568,10 +1640,13 @@ void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); // or implement remote xlocks - - // drop lock? - if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE)) - file_eval(lock); + + // others waiting? + lock->finish_waiters(SimpleLock::WAIT_WR, 0); + + //// drop lock? + //if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE)) + file_eval(lock); } @@ -1582,11 +1657,31 @@ void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) * - checks if we're in unstable sfot state and can now move on to next state * - checks if soft state should change (eg bc last writer closed) */ +class C_Locker_FileEval : public Context { + Locker *locker; + FileLock *lock; +public: + C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} + void finish(int r) { + locker->file_eval(lock); + } +}; + void Locker::file_eval(FileLock *lock) { CInode *in = (CInode*)lock->get_parent(); + // unstable and ambiguous auth? + if (!lock->is_stable() && + in->is_ambiguous_auth()) { + dout(7) << "file_eval not stable and ambiguous auth, waiting on " << *in << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); + return; + } + + int issued = in->get_caps_issued(); // [auth] finished gather? @@ -1605,7 +1700,7 @@ void Locker::file_eval(FileLock *lock) // waiters lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); lock->put_rdlock(); } break; @@ -1789,19 +1884,15 @@ bool Locker::file_sync(FileLock *lock) if (lock->get_state() == LOCK_LOCK) { if (in->is_replicated()) { - // soft data bufferlist softdata; lock->encode_locked_state(softdata); - - // bcast to replicas send_lock_message(lock, LOCK_AC_SYNC, softdata); } - + // change lock lock->set_state(LOCK_SYNC); - // reissue caps - issue_caps(in); + issue_caps(in); // reissue caps return true; } @@ -1813,10 +1904,10 @@ bool Locker::file_sync(FileLock *lock) issue_caps(in); } else { // no writers, go straight to sync - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC); + bufferlist softdata; + lock->encode_locked_state(softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // change lock @@ -1834,8 +1925,9 @@ bool Locker::file_sync(FileLock *lock) } else { // no writers, go straight to sync if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC); + bufferlist softdata; + lock->encode_locked_state(softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // change lock @@ -2070,6 +2162,16 @@ void Locker::handle_file_lock(FileLock *lock, MLock *m) CInode *in = (CInode*)lock->get_parent(); int from = m->get_asker(); + if (mds->is_rejoin()) { + if (in->is_rejoining()) { + dout(7) << "handle_file_lock still rejoining " << *in + << ", dropping " << *m << endl; + delete m; + return; + } + } + + dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << *lock << endl; @@ -2104,16 +2206,15 @@ void Locker::handle_file_lock(FileLock *lock, MLock *m) } if (lock->is_rdlocked()) { dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl; - in->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m)); lock->set_state(LOCK_GLOCKR); - assert(0);// i am broken.. why retry message when state captures all the info i need? - return; + break; } if (issued & CAP_FILE_RD) { + dout(7) << "handle_file_lock RD cap issued, waiting before ack on " << *in << endl; lock->set_state(LOCK_GLOCKR); break; } - + // nothing to wait for, lock and ack. { lock->set_state(LOCK_LOCK); diff --git a/branches/sage/pgs/mds/Locker.h b/branches/sage/pgs/mds/Locker.h index 6120c5c246683..5f79bbacf4de0 100644 --- a/branches/sage/pgs/mds/Locker.h +++ b/branches/sage/pgs/mds/Locker.h @@ -60,7 +60,7 @@ private: void handle_lock(MLock *m); void send_lock_message(SimpleLock *lock, int msg); - void send_lock_message(SimpleLock *lock, int msg, bufferlist &data); + void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data); // -- locks -- bool acquire_locks(MDRequest *mdr, diff --git a/branches/sage/pgs/mds/MDBalancer.cc b/branches/sage/pgs/mds/MDBalancer.cc index 12ccc87147fef..75e8872a4dc0a 100644 --- a/branches/sage/pgs/mds/MDBalancer.cc +++ b/branches/sage/pgs/mds/MDBalancer.cc @@ -706,13 +706,22 @@ void MDBalancer::find_exports(CDir *dir, void MDBalancer::hit_inode(CInode *in, int type) { // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); + float me = in->popularity[MDS_POP_JUSTME].pop[type].hit(); + float nested = in->popularity[MDS_POP_NESTED].pop[type].hit(); + float curdom = 0; + float anydom = 0; if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); + curdom = in->popularity[MDS_POP_CURDOM].pop[type].hit(); + anydom = in->popularity[MDS_POP_ANYDOM].pop[type].hit(); } - + + dout(20) << "hit_inode " << type << " pop " << me << " me, " + << nested << " nested, " + << curdom << " curdom, " + << anydom << " anydom" + << " on " << *in + << endl; + // hit auth up to import CDir *dir = in->get_parent_dir(); if (dir) hit_dir(dir, type); @@ -728,7 +737,8 @@ void MDBalancer::hit_dir(CDir *dir, int type) if (g_conf.num_mds > 2 && // FIXME >2 thing !dir->inode->is_root() && // not root (for now at least) dir->is_auth()) { - //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl; + dout(20) << "hit_dir " << type << " pop " << v << " me " + << *dir << endl; // hash this dir? (later?) if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || @@ -756,6 +766,8 @@ void MDBalancer::hit_recursive(CDir *dir, int type) // replicate? float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? + dout(20) << "hit_recursive " << type << " pop " << dir_pop << " curdom " << *dir << endl; + if (dir->is_auth()) { if (!dir->is_rep() && dir_pop >= g_conf.mds_bal_replicate_threshold) { @@ -764,7 +776,7 @@ void MDBalancer::hit_recursive(CDir *dir, int type) rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; rd_adj /= 2.0; // temper somewhat - dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl; + dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl; dir->dir_rep = CDir::REP_ALL; mds->mdcache->send_dir_updates(dir, true); @@ -777,7 +789,7 @@ void MDBalancer::hit_recursive(CDir *dir, int type) dir->is_rep() && dir_pop < g_conf.mds_bal_unreplicate_threshold) { // unreplicate - dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; + dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; dir->dir_rep = CDir::REP_NONE; mds->mdcache->send_dir_updates(dir); diff --git a/branches/sage/pgs/mds/MDCache.cc b/branches/sage/pgs/mds/MDCache.cc index 5bafe2cacb688..a352fc4351df0 100644 --- a/branches/sage/pgs/mds/MDCache.cc +++ b/branches/sage/pgs/mds/MDCache.cc @@ -42,6 +42,7 @@ #include "events/ESlaveUpdate.h" #include "events/EString.h" #include "events/EPurgeFinish.h" +#include "events/EImportFinish.h" #include "messages/MGenericMessage.h" @@ -1200,9 +1201,11 @@ void MDCache::disambiguate_imports() if (dir->authority().first != CDIR_AUTH_UNKNOWN) { dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; cancel_ambiguous_import(q->first); + mds->mdlog->submit_entry(new EImportFinish(dir, false)); } else { dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl; finish_ambiguous_import(q->first); + mds->mdlog->submit_entry(new EImportFinish(dir, true)); } } assert(my_ambiguous_imports.empty()); @@ -1262,50 +1265,71 @@ void MDCache::finish_ambiguous_import(dirfrag_t df) } -/* +/** recalc_auth_bits() * once subtree auth is disambiguated, we need to adjust all the - * auth (and dirty) bits in our cache before moving on. + * auth and dirty bits in our cache before moving on. */ void MDCache::recalc_auth_bits() { dout(7) << "recalc_auth_bits" << endl; - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); ++p) { - CInode *in = p->second; - if (in->authority().first == mds->get_nodeid()) - in->state_set(CInode::STATE_AUTH); - else { - in->state_clear(CInode::STATE_AUTH); - if (in->is_dirty()) - in->mark_clean(); - } + list dfq; // dirfrag queue + dfq.push_back(p->first); - if (in->parent) { - if (in->parent->authority().first == mds->get_nodeid()) - in->parent->state_set(CDentry::STATE_AUTH); - else { - in->parent->state_clear(CDentry::STATE_AUTH); - if (in->parent->is_dirty()) - in->parent->mark_clean(); - } - } + bool auth = p->first->authority().first == mds->get_nodeid(); + dout(10) << " subtree auth=" << auth << " for " << *p->first << endl; - list ls; - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *dir = *p; - if (dir->authority().first == mds->get_nodeid()) + while (!dfq.empty()) { + CDir *dir = dfq.front(); + dfq.pop_front(); + + // dir + if (auth) dir->state_set(CDir::STATE_AUTH); else { + dir->state_set(CDir::STATE_REJOINING); dir->state_clear(CDir::STATE_AUTH); if (dir->is_dirty()) dir->mark_clean(); } + + // dentries in this dir + for (map::iterator q = dir->items.begin(); + q != dir->items.end(); + ++q) { + // dn + CDentry *dn = q->second; + if (auth) + dn->state_set(CDentry::STATE_AUTH); + else { + dn->state_set(CDentry::STATE_REJOINING); + dn->state_clear(CDentry::STATE_AUTH); + if (dn->is_dirty()) + dn->mark_clean(); + } + + if (dn->is_primary()) { + // inode + if (auth) + dn->inode->state_set(CInode::STATE_AUTH); + else { + dn->inode->state_set(CInode::STATE_REJOINING); + dn->inode->state_clear(CInode::STATE_AUTH); + if (dn->inode->is_dirty()) + dn->inode->mark_clean(); + } + + // recurse? + if (dn->inode->is_dir()) + dn->inode->get_nested_dirfrags(dfq); + } + } } } + show_subtrees(); show_cache(); } @@ -1410,7 +1434,8 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) in->authlock.get_state(), in->linklock.get_state(), in->dirfragtreelock.get_state(), - in->filelock.get_state()); + in->filelock.get_state(), + in->dirlock.get_state()); if (in->authlock.is_xlocked()) rejoin->add_inode_xlock(in->ino(), in->authlock.get_type(), in->authlock.get_xlocked_by()->reqid); @@ -1489,7 +1514,7 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) if (mds->is_active() || mds->is_stopping()) { dout(10) << "i am active. removing stale cache replicas" << endl; - // first, scour cache of replica references + // first, scour cache of unmentioned replica references for (hash_map::iterator p = inode_map.begin(); p != inode_map.end(); ++p) { @@ -1548,7 +1573,8 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) if (dn) { int nonce = dn->add_replica(from); dout(10) << " have " << *dn << endl; - ack->add_strong_dentry(*p, *q, dn->lock.get_state(), nonce); + if (ack) + ack->add_strong_dentry(*p, *q, dn->lock.get_state(), nonce); } else { dout(10) << " missing " << *p << " " << *q << endl; if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); @@ -1578,19 +1604,22 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) if (in) { int nonce = in->add_replica(from); in->mds_caps_wanted.erase(from); - in->authlock.remove_gather(from); // just in case - in->linklock.remove_gather(from); // just in case - in->dirfragtreelock.remove_gather(from); // just in case - in->filelock.remove_gather(from); // just in case dout(10) << " have (weak) " << *in << endl; - if (ack) + if (ack) { + in->authlock.remove_gather(from); + in->linklock.remove_gather(from); + in->dirfragtreelock.remove_gather(from); + in->filelock.remove_gather(from); + in->dirlock.remove_gather(from); ack->add_strong_inode(in->ino(), nonce, 0, in->authlock.get_replica_state(), in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state()); + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); + } } else { dout(10) << " missing " << *p << endl; if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); @@ -1609,23 +1638,34 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) in->mds_caps_wanted[from] = p->second.caps_wanted; else in->mds_caps_wanted.erase(from); - in->authlock.remove_gather(from); // just in case - in->linklock.remove_gather(from); // just in case - in->dirfragtreelock.remove_gather(from); // just in case - in->filelock.remove_gather(from); // just in case dout(10) << " have (strong) " << *in << endl; if (ack) { + // i had inode, just tell replica the correct state + in->authlock.remove_gather(from); + in->linklock.remove_gather(from); + in->dirfragtreelock.remove_gather(from); + in->filelock.remove_gather(from); + in->dirlock.remove_gather(from); ack->add_strong_inode(in->ino(), nonce, 0, in->authlock.get_replica_state(), in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state()); + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); } else { - // note strong replica filelock state requests - //if (p->second.filelock & CAP_FILE_RD) - //filelock_replica_readers.insert(in); + // take note of replica state values. + // SimpleLock -- + // we can ignore; locked replicas can be safely changed to sync. + // FileLock -- + // we can also ignore. + // replicas will at most issue RDCACHE|RD, which is covered by the default SYNC, + // so only _locally_ opened files are significant. + // ScatterLock -- adjust accordingly + if (p->second.dirlock == LOCK_SCATTER || + p->second.dirlock == LOCK_GSCATTERS) // replica still has rdlocks + in->dirlock.set_state(LOCK_SCATTER); } } else { dout(10) << " missing " << p->first << endl; @@ -1711,6 +1751,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *m) assert(dir); dir->set_replica_nonce(p->second.nonce); + dir->state_clear(CDir::STATE_REJOINING); dout(10) << " got " << *dir << endl; // dentries @@ -1721,6 +1762,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *m) assert(dn); dn->set_replica_nonce(q->second.nonce); dn->lock.set_state(q->second.lock); + dn->state_clear(CDentry::STATE_REJOINING); dout(10) << " got " << *dn << endl; } } @@ -1736,6 +1778,8 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *m) in->linklock.set_state(p->second.linklock); in->dirfragtreelock.set_state(p->second.dirfragtreelock); in->filelock.set_state(p->second.filelock); + in->dirlock.set_state(p->second.dirlock); + in->state_clear(CInode::STATE_REJOINING); dout(10) << " got " << *in << endl; } @@ -1767,7 +1811,11 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) p != m->weak_dirfrags.end(); ++p) { CDir *dir = get_dirfrag(*p); - assert(dir); + if (!dir) { + dout(10) << " don't have dirfrag " << *p << endl; + continue; // we must have trimmed it after the original rejoin + } + dout(10) << " sending " << *dir << endl; // dentries @@ -1775,7 +1823,10 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) q != m->weak_dentries[*p].end(); ++q) { CDentry *dn = dir->lookup(*q); - assert(dn); + if (!dn) { + dout(10) << " don't have dentry " << *q << " in " << *dir << endl; + continue; // we must have trimmed it after our original rejoin + } dout(10) << " sending " << *dn << endl; if (mds->is_rejoin()) full->add_weak_dentry(*p, *q); @@ -1789,7 +1840,10 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) p != m->weak_inodes.end(); ++p) { CInode *in = get_inode(*p); - assert(in); + if (!in) { + dout(10) << " don't have inode " << *p << endl; + continue; // we must have trimmed it after the originalo rejoin + } dout(10) << " sending " << *in << endl; full->add_full_inode(in->inode, in->symlink, in->dirfragtree); @@ -1802,7 +1856,8 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) in->authlock.get_replica_state(), in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state()); + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); } mds->send_message_mds(full, m->get_source().num(), MDS_PORT_CACHE); @@ -1810,7 +1865,13 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *m) void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *m) { + dout(7) << "handle_cache_rejoin_full from " << m->get_source() << endl; + + assert(0); // write me + + + delete m; } void MDCache::send_cache_rejoin_acks() @@ -1893,7 +1954,8 @@ void MDCache::send_cache_rejoin_acks() in->authlock.get_replica_state(), in->linklock.get_replica_state(), in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state()); + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); } // subdirs in this subtree? @@ -1914,6 +1976,7 @@ void MDCache::send_cache_rejoin_acks() // =============================================================================== +/* void MDCache::rename_file(CDentry *srcdn, CDentry *destdn) { @@ -1928,7 +1991,7 @@ void MDCache::rename_file(CDentry *srcdn, // link inode w/ dentry destdn->dir->link_inode( destdn, in ); } - +*/ void MDCache::set_root(CInode *in) @@ -4647,9 +4710,9 @@ void MDCache::show_cache() p != dir->items.end(); ++p) { CDentry *dn = p->second; - dout(7) << " dentry " << *dn << endl; + dout(7) << " dentry " << *dn << endl; if (dn->is_primary() && dn->inode) - dout(7) << " inode " << *dn->inode << endl; + dout(7) << " inode " << *dn->inode << endl; } } } diff --git a/branches/sage/pgs/mds/MDS.cc b/branches/sage/pgs/mds/MDS.cc index e0043087be590..e4b0d21d6959b 100644 --- a/branches/sage/pgs/mds/MDS.cc +++ b/branches/sage/pgs/mds/MDS.cc @@ -441,6 +441,7 @@ void MDS::beacon_kill(utime_t lab) void MDS::handle_mds_map(MMDSMap *m) { + version_t hadepoch = mdsmap->get_epoch(); version_t epoch = m->get_epoch(); dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; @@ -671,6 +672,12 @@ void MDS::handle_mds_map(MMDSMap *m) } */ + // just got mdsmap+osdmap? + if (hadepoch == 0 && + mdsmap->get_epoch() > 0 && + osdmap->get_epoch() > 0) + boot(); + delete m; } @@ -691,24 +698,30 @@ void MDS::bcast_mds_map() void MDS::handle_osd_map(MOSDMap *m) { - version_t had = osdmap->get_epoch(); + version_t hadepoch = osdmap->get_epoch(); + dout(10) << "handle_osd_map had " << hadepoch << endl; - dout(10) << "handle_osd_map had " << had << endl; - - // process locally + // process objecter->handle_osd_map(m); - if (had == 0 && osdmap->get_epoch() > 0) { - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting()) - boot_start(); // old tables, empty journal - else if (is_replay()) - boot_replay(); // replay, join - else - assert(is_standby()); - } - + // just got mdsmap+osdmap? + if (hadepoch == 0 && + osdmap->get_epoch() > 0 && + mdsmap->get_epoch() > 0) + boot(); +} + + +void MDS::boot() +{ + if (is_creating()) + boot_create(); // new tables, journal + else if (is_starting()) + boot_start(); // old tables, empty journal + else if (is_replay()) + boot_replay(); // replay, join + else + assert(0); } @@ -1050,6 +1063,7 @@ void MDS::my_dispatch(Message *m) // hack: thrash exports for (int i=0; i s; + if (!is_active()) break; mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); if (s.size() < 2 || mdcache->get_num_inodes() < 10) break; // need peers for this to work. diff --git a/branches/sage/pgs/mds/MDS.h b/branches/sage/pgs/mds/MDS.h index dde751472f9bb..ae00ea17df3fc 100644 --- a/branches/sage/pgs/mds/MDS.h +++ b/branches/sage/pgs/mds/MDS.h @@ -189,6 +189,7 @@ class MDS : public Dispatcher { int init(bool standby=false); void reopen_logger(); + void boot(); void boot_create(); // i am new mds. void boot_start(); // i am old but empty (was down:out) mds. void boot_replay(int step=0); // i am recovering existing (down:failed) mds. diff --git a/branches/sage/pgs/mds/ScatterLock.h b/branches/sage/pgs/mds/ScatterLock.h index 564bc9155a6e3..08a57d8ea67b3 100644 --- a/branches/sage/pgs/mds/ScatterLock.h +++ b/branches/sage/pgs/mds/ScatterLock.h @@ -84,7 +84,7 @@ public: void print(ostream& out) { out << "("; - //out << get_lock_type_name(l.get_type()) << " "; + out << get_lock_type_name(get_type()) << " "; out << get_scatterlock_state_name(get_state()); if (!get_gather_set().empty()) out << " g=" << get_gather_set(); if (is_rdlocked()) diff --git a/branches/sage/pgs/mds/Server.cc b/branches/sage/pgs/mds/Server.cc index 3435f3cbe491c..09d7e54552243 100644 --- a/branches/sage/pgs/mds/Server.cc +++ b/branches/sage/pgs/mds/Server.cc @@ -891,6 +891,54 @@ CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) } */ + +/** predirty_dn_diri + * predirty the directory inode for a new dentry, if it is auth (and not root) + * BUG: root inode doesn't get dirtied properly, currently. blech. + */ +version_t Server::predirty_dn_diri(CDentry *dn, EMetaBlob *blob, utime_t mtime) +{ + version_t dirpv = 0; + CInode *diri = dn->dir->inode; + + if (diri->is_auth() && !diri->is_root()) { + dirpv = diri->pre_dirty(); + inode_t *pi = blob->add_primary_dentry(diri->get_parent_dn(), true); + pi->version = dirpv; + pi->ctime = pi->mtime = mtime; + dout(10) << "predirty_dn_diri ctime/mtime " << mtime << " pv " << dirpv << " on " << *diri << endl; + } + + return dirpv; +} + +/** dirty_dn_diri + * follow-up with actual dirty of inode after journal entry commits. + */ +void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) +{ + CInode *diri = dn->dir->inode; + + // make the udpate + diri->inode.ctime = diri->inode.mtime = mtime; + + if (diri->is_auth() && !diri->is_root()) { + // we're auth. + diri->mark_dirty(dirpv); + dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl; + } else { + // we're not auth. dirlock scatterlock will propagate the update. + } +} + + + + + + + + + // =============================================================================== // STAT @@ -1238,10 +1286,11 @@ class C_MDS_mknod_finish : public Context { CDentry *dn; CInode *newi; version_t pv; + version_t dirpv; public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : + C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) : mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} + pv(d->get_projected_version()), dirpv(dirpv_) {} void finish(int r) { assert(r == 0); @@ -1252,9 +1301,8 @@ public: newi->mark_dirty(pv); // dir inode's mtime - dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, - newi->inode.ctime); - + mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); + // hit pop mds->balancer->hit_inode(newi, META_POP_IWR); @@ -1265,6 +1313,8 @@ public: } }; + + void Server::handle_client_mknod(MDRequest *mdr) { MClientRequest *req = mdr->client_request(); @@ -1282,14 +1332,17 @@ void Server::handle_client_mknod(MDRequest *mdr) newi->inode.mode |= INODE_MODE_FILE; // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mknod"); le->metablob.add_client_req(req->get_reqid()); + + version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too + le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); // log + wait + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); mdlog->wait_for_sync(fin); } @@ -1322,15 +1375,16 @@ void Server::handle_client_mkdir(MDRequest *mdr) newdir->mark_dirty(newdir->pre_dirty()); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mkdir"); le->metablob.add_client_req(req->get_reqid()); + version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); le->metablob.add_dir(newdir, true); // log + wait + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); mdlog->wait_for_sync(fin); @@ -1370,14 +1424,15 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->symlink = req->get_sarg(); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("symlink"); le->metablob.add_client_req(req->get_reqid()); + version_t dirpv = predirty_dn_diri(dn, &le->metablob, newi->inode.ctime); // dir mtime too le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); // log + wait + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); mdlog->wait_for_sync(fin); } @@ -1490,15 +1545,17 @@ class C_MDS_link_local_finish : public Context { version_t dpv; utime_t tctime; version_t tpv; + version_t dirpv; public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, utime_t ct) : + C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_, utime_t ct) : mds(m), mdr(r), dn(d), targeti(ti), dpv(d->get_projected_version()), tctime(ct), - tpv(targeti->get_parent_dn()->get_projected_version()) {} + tpv(targeti->get_parent_dn()->get_projected_version()), + dirpv(dirpv_) { } void finish(int r) { assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv); + mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv, dirpv); } }; @@ -1517,6 +1574,8 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) version_t tpdv = targeti->pre_dirty(); // add to event + utime_t now = g_clock.real_now(); + version_t dirpv = predirty_dn_diri(dn, &le->metablob, now); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote le->metablob.add_dir_context(targeti->get_parent_dir()); @@ -1524,11 +1583,11 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) // update journaled target inode pi->nlink++; - pi->ctime = g_clock.real_now(); + pi->ctime = now; pi->version = tpdv; // finisher - C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, pi->ctime); + C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, dirpv, now); // log + wait mdlog->submit_entry(le); @@ -1536,7 +1595,7 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) } void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, utime_t tctime, version_t tpv) + version_t dpv, utime_t tctime, version_t tpv, version_t dirpv) { dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; @@ -1551,8 +1610,7 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, targeti->mark_dirty(tpv); // dir inode's mtime - dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, - tctime); + dirty_dn_diri(dn, dirpv, tctime); // bump target popularity mds->balancer->hit_inode(targeti, META_POP_IWR); @@ -1738,16 +1796,17 @@ class C_MDS_unlink_local_finish : public Context { CDentry *straydn; version_t ipv; // referred inode utime_t ictime; - version_t dpv; // deleted dentry + version_t dnpv; // deleted dentry + version_t dirpv; public: C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t v, utime_t ct) : + version_t v, version_t dirpv_, utime_t ct) : mds(m), mdr(r), dn(d), straydn(sd), ipv(v), ictime(ct), - dpv(d->get_projected_version()) { } + dnpv(d->get_projected_version()), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dpv); + mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dnpv, dirpv); } }; @@ -1790,18 +1849,20 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn) } // the unlinked dentry + utime_t now = g_clock.real_now(); dn->pre_dirty(); + version_t dirpv = predirty_dn_diri(dn, &le->metablob, now); le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_null_dentry(dn, true); // update journaled target inode pi->nlink--; - pi->ctime = g_clock.real_now(); + pi->ctime = now; pi->version = ipv; // finisher C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - ipv, pi->ctime); + ipv, dirpv, now); journal_opens(); // journal pending opens, just in case @@ -1814,7 +1875,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn) void Server::_unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, - version_t ipv, utime_t ictime, version_t dpv) + version_t ipv, utime_t ictime, version_t dnpv, version_t dirpv) { dout(10) << "_unlink_local " << *dn << endl; @@ -1829,11 +1890,10 @@ void Server::_unlink_local_finish(MDRequest *mdr, in->inode.ctime = ictime; in->inode.nlink--; in->mark_dirty(ipv); // dirty inode - dn->mark_dirty(dpv); // dirty old dentry + dn->mark_dirty(dnpv); // dirty old dentry // dir inode's mtime - dn->get_dir()->get_inode()->inode.mtime = MAX(dn->get_dir()->get_inode()->inode.mtime, - ictime); + dirty_dn_diri(dn, dirpv, ictime); // share unlink news with replicas for (map::iterator it = dn->replicas_begin(); @@ -2147,25 +2207,27 @@ class C_MDS_rename_local_finish : public Context { version_t straypv; version_t destpv; version_t srcpv; + version_t ddirpv, sdirpv; utime_t ictime; public: version_t atid1; version_t atid2; C_MDS_rename_local_finish(MDS *m, MDRequest *r, CDentry *sdn, CDentry *ddn, CDentry *stdn, - version_t v, utime_t ct) : + version_t v, version_t ddirpv_, version_t sdirpv_, utime_t ct) : mds(m), mdr(r), srcdn(sdn), destdn(ddn), straydn(stdn), ipv(v), straypv(straydn ? straydn->get_projected_version():0), destpv(destdn->get_projected_version()), srcpv(srcdn->get_projected_version()), + ddirpv(ddirpv_), sdirpv(sdirpv_), ictime(ct), atid1(0), atid2(0) { } void finish(int r) { assert(r == 0); mds->server->_rename_local_finish(mdr, srcdn, destdn, straydn, - srcpv, destpv, straypv, ipv, ictime, + srcpv, destpv, straypv, ipv, ddirpv, sdirpv, ictime, atid1, atid2); } }; @@ -2194,6 +2256,8 @@ void Server::_rename_local(MDRequest *mdr, EUpdate *le = new EUpdate("rename_local"); le->metablob.add_client_req(mdr->reqid); + utime_t now = g_clock.real_now(); + CDentry *straydn = 0; inode_t *pi = 0; version_t ipv = 0; @@ -2204,9 +2268,14 @@ void Server::_rename_local(MDRequest *mdr, // primary+remote link merge? bool linkmerge = (srcdn->inode == destdn->inode && (srcdn->is_primary() || destdn->is_primary())); + + // dir mtimes + version_t ddirpv = predirty_dn_diri(destdn, &le->metablob, now); + version_t sdirpv = predirty_dn_diri(srcdn, &le->metablob, now); + if (linkmerge) { dout(10) << "will merge remote+primary links" << endl; - + // destdn -> primary le->metablob.add_dir_context(destdn->dir); ipv = destdn->pre_dirty(destdn->inode->inode.version); @@ -2300,13 +2369,13 @@ void Server::_rename_local(MDRequest *mdr, if (pi) { // update journaled target inode pi->nlink--; - pi->ctime = g_clock.real_now(); + pi->ctime = now; pi->version = ipv; } C_MDS_rename_local_finish *fin = new C_MDS_rename_local_finish(mds, mdr, srcdn, destdn, straydn, - ipv, pi ? pi->ctime:utime_t()); + ipv, ddirpv, sdirpv, now); journal_opens(); // journal pending opens, just in case @@ -2340,6 +2409,7 @@ void Server::_rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *f void Server::_rename_local_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn, version_t srcpv, version_t destpv, version_t straypv, version_t ipv, + version_t ddirpv, version_t sdirpv, utime_t ictime, version_t atid1, version_t atid2) { @@ -2352,8 +2422,13 @@ void Server::_rename_local_finish(MDRequest *mdr, bool linkmerge = (srcdn->inode == destdn->inode && (srcdn->is_primary() || destdn->is_primary())); + // dir mtimes + dirty_dn_diri(destdn, ddirpv, ictime); + dirty_dn_diri(srcdn, sdirpv, ictime); + if (linkmerge) { assert(ipv); + if (destdn->is_primary()) { dout(10) << "merging remote onto primary link" << endl; @@ -2755,7 +2830,11 @@ void Server::_do_open(MDRequest *mdr, CInode *cur) << " on " << *cur << endl; // hit pop - mds->balancer->hit_inode(cur, META_POP_IRD); + if (cmode == FILE_MODE_RW || + cmode == FILE_MODE_W) + mds->balancer->hit_inode(cur, META_POP_IWR); + else + mds->balancer->hit_inode(cur, META_POP_IRD); // reply MClientReply *reply = new MClientReply(req, 0); diff --git a/branches/sage/pgs/mds/Server.h b/branches/sage/pgs/mds/Server.h index a132fc4501ad1..ef46c0a56c8d6 100644 --- a/branches/sage/pgs/mds/Server.h +++ b/branches/sage/pgs/mds/Server.h @@ -70,6 +70,9 @@ public: CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); //CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); + version_t predirty_dn_diri(CDentry *dn, class EMetaBlob *blob, utime_t mtime); + void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); + // requests on existing inodes. void handle_client_stat(MDRequest *mdr); void handle_client_utime(MDRequest *mdr); @@ -108,7 +111,7 @@ public: void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); void _link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t, utime_t, version_t); + version_t, utime_t, version_t, version_t); void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); // unlink @@ -117,7 +120,7 @@ public: void _unlink_local(MDRequest *mdr, CDentry *dn); void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, - version_t, utime_t, version_t); + version_t, utime_t, version_t, version_t); void _unlink_remote(MDRequest *mdr, CDentry *dn); // rename @@ -134,7 +137,7 @@ public: void _rename_local_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn, version_t srcpv, version_t destpv, version_t straypv, version_t ipv, - utime_t ictime, + version_t ddirpv, version_t sdirpv, utime_t ictime, version_t atid1, version_t atid2); }; diff --git a/branches/sage/pgs/mds/SimpleLock.h b/branches/sage/pgs/mds/SimpleLock.h index 7df38a0a6bd1b..adb13dfc228c6 100644 --- a/branches/sage/pgs/mds/SimpleLock.h +++ b/branches/sage/pgs/mds/SimpleLock.h @@ -30,29 +30,31 @@ inline const char *get_lock_type_name(int t) { switch (t) { - case LOCK_OTYPE_DN: return "dentry"; - case LOCK_OTYPE_IFILE: return "inode_file"; - case LOCK_OTYPE_IAUTH: return "inode_auth"; - case LOCK_OTYPE_ILINK: return "inode_link"; - case LOCK_OTYPE_IDIRFRAGTREE: return "inode_dirfragtree"; - case LOCK_OTYPE_IDIR: return "inode_dir"; + case LOCK_OTYPE_DN: return "dn"; + case LOCK_OTYPE_IFILE: return "ifile"; + case LOCK_OTYPE_IAUTH: return "iauth"; + case LOCK_OTYPE_ILINK: return "ilink"; + case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; + case LOCK_OTYPE_IDIR: return "idir"; default: assert(0); } } // -- lock states -- #define LOCK_UNDEF 0 -// auth rep +// auth rep #define LOCK_SYNC 1 // AR R . R . #define LOCK_LOCK 2 // AR R W . . #define LOCK_GLOCKR -3 // AR R . . . +#define LOCK_REMOTEXLOCK -50 // on NON-auth inline const char *get_simplelock_state_name(int n) { switch (n) { - case LOCK_UNDEF: return "undef"; + case LOCK_UNDEF: return "UNDEF"; case LOCK_SYNC: return "sync"; case LOCK_LOCK: return "lock"; case LOCK_GLOCKR: return "glockr"; + case LOCK_REMOTEXLOCK: return "remote_xlock"; default: assert(0); } } @@ -63,8 +65,7 @@ class SimpleLock { public: static const int WAIT_RD = (1<<0); // to read static const int WAIT_WR = (1<<1); // to write - static const int WAIT_NOLOCKS = (1<<2); // for last rdlock to finish - //static const int WAIT_LOCK = (1<<3); // for locked state + static const int WAIT_SINGLEAUTH = (1<<2); static const int WAIT_STABLE = (1<<3); // for a stable state static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock static const int WAIT_BITS = 5; @@ -248,7 +249,7 @@ public: virtual void print(ostream& out) { out << "("; - //out << get_lock_type_name(l.get_type()) << " "; + out << get_lock_type_name(get_type()) << " "; out << get_simplelock_state_name(get_state()); if (!get_gather_set().empty()) out << " g=" << get_gather_set(); if (is_rdlocked()) diff --git a/branches/sage/pgs/mds/events/EExport.h b/branches/sage/pgs/mds/events/EExport.h index 476d4fd9ffcae..29d8e0df08f49 100644 --- a/branches/sage/pgs/mds/events/EExport.h +++ b/branches/sage/pgs/mds/events/EExport.h @@ -40,7 +40,7 @@ public: set &get_bounds() { return bounds; } void print(ostream& out) { - out << "export " << base << " " << metablob; + out << "EExport " << base << " " << metablob; } virtual void encode_payload(bufferlist& bl) { diff --git a/branches/sage/pgs/mds/events/EImportFinish.h b/branches/sage/pgs/mds/events/EImportFinish.h index 8e26dfd035f20..0ee6d71ffdc13 100644 --- a/branches/sage/pgs/mds/events/EImportFinish.h +++ b/branches/sage/pgs/mds/events/EImportFinish.h @@ -33,7 +33,7 @@ class EImportFinish : public LogEvent { EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } void print(ostream& out) { - out << "import_finish " << base; + out << "EImportFinish " << base; if (success) out << " success"; else diff --git a/branches/sage/pgs/mds/mdstypes.h b/branches/sage/pgs/mds/mdstypes.h index 8966ba7b79f08..d54b52aa73d9b 100644 --- a/branches/sage/pgs/mds/mdstypes.h +++ b/branches/sage/pgs/mds/mdstypes.h @@ -292,8 +292,9 @@ class MDSCacheObject { } // -- state -- - const static int STATE_AUTH = (1<<30); - const static int STATE_DIRTY = (1<<29); + const static int STATE_AUTH = (1<<30); + const static int STATE_DIRTY = (1<<29); + const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy // -- wait -- const static int WAIT_SINGLEAUTH = (1<<30); @@ -327,8 +328,9 @@ class MDSCacheObject { void state_reset(unsigned s) { state = s; } bool is_auth() { return state_test(STATE_AUTH); } - bool is_dirty() { return state & STATE_DIRTY; } + bool is_dirty() { return state_test(STATE_DIRTY); } bool is_clean() { return !is_dirty(); } + bool is_rejoining() { return state_test(STATE_REJOINING); } // -------------------------------------------- // authority @@ -457,7 +459,7 @@ protected: if (waiting.empty()) get(PIN_WAITER); waiting.insert(pair(mask, c)); - dout(10) << (mdsco_db_line_prefix(this)) + pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) << "add_waiter " << mask << " " << c << " on " << *this << endl; @@ -469,14 +471,14 @@ protected: while (it != waiting.end()) { if (it->first & mask) { ls.push_back(it->second); - dout(10) << (mdsco_db_line_prefix(this)) + pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; waiting.erase(it++); } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second + pdout(10,g_conf.debug_mds) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; diff --git a/branches/sage/pgs/messages/MClientReply.h b/branches/sage/pgs/messages/MClientReply.h index c7445b7476f20..e88c31ca47400 100644 --- a/branches/sage/pgs/messages/MClientReply.h +++ b/branches/sage/pgs/messages/MClientReply.h @@ -185,7 +185,9 @@ class MClientReply : public Message { virtual char *get_type_name() { return "creply"; } void print(ostream& o) { o << "creply(" << env.dst.name << "." << st.tid; - if (st.result) o << " = " << st.result; + o << " = " << st.result; + if (st.result <= 0) + o << " " << strerror(-st.result); o << ")"; } diff --git a/branches/sage/pgs/messages/MLock.h b/branches/sage/pgs/messages/MLock.h index 02229125cc521..890c536e75310 100644 --- a/branches/sage/pgs/messages/MLock.h +++ b/branches/sage/pgs/messages/MLock.h @@ -17,7 +17,7 @@ #define __MLOCK_H #include "msg/Message.h" - +#include "mds/SimpleLock.h" // for replicas #define LOCK_AC_SYNC -1 @@ -91,6 +91,12 @@ class MLock : public Message { data.claim(bl); } virtual char *get_type_name() { return "ILock"; } + void print(ostream& out) { + out << "lock(a=" << action + << " " << ino + << " " << get_lock_type_name(otype) + << ")"; + } void set_ino(inodeno_t ino, char ot) { otype = ot; @@ -111,32 +117,27 @@ class MLock : public Message { this->dn = dn; } void set_reqid(metareqid_t ri) { reqid = ri; } - void set_data(bufferlist& data) { - this->data.claim( data ); + void set_data(const bufferlist& data) { + this->data = data; } void decode_payload() { int off = 0; - payload.copy(off,sizeof(action), (char*)&action); - off += sizeof(action); - payload.copy(off,sizeof(asker), (char*)&asker); - off += sizeof(asker); - payload.copy(off,sizeof(otype), (char*)&otype); - off += sizeof(otype); - payload.copy(off,sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off,sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); + ::_decode(action, payload, off); + ::_decode(asker, payload, off); + ::_decode(otype, payload, off); + ::_decode(ino, payload, off); + ::_decode(dirfrag, payload, off); ::_decode(reqid, payload, off); ::_decode(dn, payload, off); ::_decode(data, payload, off); } virtual void encode_payload() { - payload.append((char*)&action, sizeof(action)); - payload.append((char*)&asker, sizeof(asker)); - payload.append((char*)&otype, sizeof(otype)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&dirfrag, sizeof(dirfrag)); + ::_encode(action, payload); + ::_encode(asker, payload); + ::_encode(otype, payload); + ::_encode(ino, payload); + ::_encode(dirfrag, payload); ::_encode(reqid, payload); ::_encode(dn, payload); ::_encode(data, payload); diff --git a/branches/sage/pgs/messages/MMDSCacheRejoin.h b/branches/sage/pgs/messages/MMDSCacheRejoin.h index 4ec9e3a6c8179..d524b9c2d4a02 100644 --- a/branches/sage/pgs/messages/MMDSCacheRejoin.h +++ b/branches/sage/pgs/messages/MMDSCacheRejoin.h @@ -45,11 +45,12 @@ class MMDSCacheRejoin : public Message { int32_t linklock; int32_t dirfragtreelock; int32_t filelock; + __int32_t dirlock; inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0) : + inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : caps_wanted(cw), nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f) { } + authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } }; struct inode_full { inode_t inode; @@ -112,8 +113,8 @@ class MMDSCacheRejoin : public Message { void add_weak_inode(inodeno_t ino) { weak_inodes.insert(ino); } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f); + void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { + strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); } void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { full_inodes.push_back(inode_full(i, s, f)); diff --git a/branches/sage/pgs/messages/MOSDMap.h b/branches/sage/pgs/messages/MOSDMap.h index ee75f9bff9b69..83929ddd23c28 100644 --- a/branches/sage/pgs/messages/MOSDMap.h +++ b/branches/sage/pgs/messages/MOSDMap.h @@ -65,6 +65,9 @@ class MOSDMap : public Message { } virtual char *get_type_name() { return "omap"; } + void print(ostream& out) { + out << "osdmap(" << get_first() << "," << get_last() << ")"; + } }; #endif diff --git a/branches/sage/pgs/mon/OSDMonitor.cc b/branches/sage/pgs/mon/OSDMonitor.cc index d81b2de404428..c29e15a2ad3a2 100644 --- a/branches/sage/pgs/mon/OSDMonitor.cc +++ b/branches/sage/pgs/mon/OSDMonitor.cc @@ -414,6 +414,7 @@ void OSDMonitor::handle_osd_boot(MOSDBoot *m) bcast_latest_osd(); bcast_latest_mds(); + send_waiting(); } else { dout(7) << "osd_boot waiting for " << (osdmap.osds.size() - osdmap.osd_inst.size()) @@ -541,11 +542,26 @@ void OSDMonitor::send_waiting() for (map >::iterator i = awaiting_map.begin(); i != awaiting_map.end(); - i++) - send_incremental(i->second.second, i->second.first); + i++) { + if (i->second.second) + send_incremental(i->second.second, i->second.first); + else + send_full(i->second.first); + } } +void OSDMonitor::send_latest(entity_inst_t who) +{ + // FIXME this is super naive + if (osdmap.get_epoch() == 0) { + awaiting_map[who.name].first = who; + awaiting_map[who.name].second = 0; + } else { + send_full(who); + } +} + void OSDMonitor::send_full(entity_inst_t who) { messenger->send_message(new MOSDMap(&osdmap), who); diff --git a/branches/sage/pgs/mon/OSDMonitor.h b/branches/sage/pgs/mon/OSDMonitor.h index 18dc9f3a7943e..000a79f4024bc 100644 --- a/branches/sage/pgs/mon/OSDMonitor.h +++ b/branches/sage/pgs/mon/OSDMonitor.h @@ -103,9 +103,7 @@ private: void mark_all_down(); - void send_latest(entity_inst_t i) { - send_full(i); - } + void send_latest(entity_inst_t i); void fake_osd_failure(int osd, bool down); void fake_osdmap_update(); diff --git a/branches/sage/pgs/osdc/Journaler.cc b/branches/sage/pgs/osdc/Journaler.cc index 12285442d56f6..d94109864e148 100644 --- a/branches/sage/pgs/osdc/Journaler.cc +++ b/branches/sage/pgs/osdc/Journaler.cc @@ -169,7 +169,8 @@ void Journaler::write_head(Context *oncommit) bufferlist bl; bl.append((char*)&last_written, sizeof(last_written)); filer.write(inode, 0, bl.length(), bl, 0, - 0, new C_WriteHead(this, last_written, oncommit)); + 0, + new C_WriteHead(this, last_written, oncommit)); } void Journaler::_finish_write_head(Header &wrote, Context *oncommit) @@ -293,8 +294,10 @@ void Journaler::flush(Context *onsync) dout(10) << "flush flushing " << flush_pos << "~" << len << endl; // submit write for anything pending + // flush _start_ pos to _finish_flush filer.write(inode, flush_pos, len, write_buf, 0, - new C_Flush(this, flush_pos), 0); // flush _start_ pos to _finish_flush + g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK + g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT pending_flush[flush_pos] = g_clock.now(); // adjust pointers -- 2.39.5