From 6da205a0a553fc521f74c3bab1539b6613e4f314 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 9 Oct 2007 18:32:36 +0000 Subject: [PATCH] segment changes, simplified shutdown, rejoin base inodes git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1899 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/Makefile | 2 +- branches/sage/mds/client/SyntheticClient.cc | 8 +- branches/sage/mds/config.cc | 16 +- branches/sage/mds/config.h | 4 +- branches/sage/mds/include/lru.h | 9 + branches/sage/mds/mds/CInode.cc | 11 + branches/sage/mds/mds/CInode.h | 10 +- branches/sage/mds/mds/IdAllocator.h | 1 + branches/sage/mds/mds/MDBalancer.cc | 4 +- branches/sage/mds/mds/MDCache.cc | 223 ++++++++++++-------- branches/sage/mds/mds/MDCache.h | 7 +- branches/sage/mds/mds/MDLog.cc | 119 ++++++----- branches/sage/mds/mds/MDLog.h | 20 +- branches/sage/mds/mds/MDS.cc | 12 +- branches/sage/mds/mds/Migrator.h | 3 + branches/sage/mds/mds/journal.cc | 15 +- branches/sage/mds/mds/mdstypes.h | 2 +- 17 files changed, 283 insertions(+), 183 deletions(-) diff --git a/branches/sage/mds/Makefile b/branches/sage/mds/Makefile index 393ede6e8c5ff..6295940ebc64e 100644 --- a/branches/sage/mds/Makefile +++ b/branches/sage/mds/Makefile @@ -16,7 +16,7 @@ EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib EXTRA_CFLAGS += -g EXTRA_CFLAGS += -pg -#EXTRA_CFLAGS += -O3 +EXTRA_CFLAGS += -O3 # base CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} diff --git a/branches/sage/mds/client/SyntheticClient.cc b/branches/sage/mds/client/SyntheticClient.cc index 2210ce5b9b21c..d742d3114bd01 100644 --- a/branches/sage/mds/client/SyntheticClient.cc +++ b/branches/sage/mds/client/SyntheticClient.cc @@ -272,6 +272,8 @@ int SyntheticClient::run() run_until = utime_t(0,0); dout(5) << "run" << dendl; + int seq = 0; + for (list::iterator it = modes.begin(); it != modes.end(); it++) { @@ -409,7 +411,7 @@ int SyntheticClient::run() break; case SYNCLIENT_MODE_MAKEDIRS: { - string sarg1 = get_sarg(0); + string sarg1 = get_sarg(seq++); int iarg1 = iargs.front(); iargs.pop_front(); int iarg2 = iargs.front(); iargs.pop_front(); int iarg3 = iargs.front(); iargs.pop_front(); @@ -1547,13 +1549,13 @@ int SyntheticClient::make_files(int num, int count, int priv, bool more) } } else { // shared - if (whoami == 0) { + if (true || whoami == 0) { for (int c=0; cmkdir(d, 0755); } } else { - sleep(5); + sleep(2); } } diff --git a/branches/sage/mds/config.cc b/branches/sage/mds/config.cc index 6be6a95b6702f..5cd3b8e3cf2c5 100644 --- a/branches/sage/mds/config.cc +++ b/branches/sage/mds/config.cc @@ -193,7 +193,7 @@ md_config_t g_conf = { journaler_batch_max: 16384, // max bytes we'll delay flushing // --- mds --- - mds_cache_size: MDS_CACHE_SIZE, + mds_cache_size: 300000, //MDS_CACHE_SIZE, mds_cache_mid: .7, mds_decay_halflife: 5, @@ -204,9 +204,8 @@ md_config_t g_conf = { mds_log: true, mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, mds_log_max_segments: 100, - mds_log_max_trimming: 10, + mds_log_max_expiring: 20, mds_log_pad_entry: 128,//256,//64, - mds_log_flush_on_shutdown: true, mds_log_eopen_size: 100, // # open inodes per log entry mds_bal_sample_interval: 3.0, // every 5 seconds @@ -219,7 +218,7 @@ md_config_t g_conf = { mds_bal_merge_rd: 1000, mds_bal_merge_wr: 1000, mds_bal_interval: 10, // seconds - mds_bal_fragment_interval: 5, // seconds + mds_bal_fragment_interval: 2, // seconds mds_bal_idle_threshold: 0, //.1, mds_bal_max: -1, mds_bal_max_until: -1, @@ -233,7 +232,6 @@ md_config_t g_conf = { mds_bal_minchunk: .001, // never take anything smaller than this mds_trim_on_rejoin: true, - mds_commit_on_shutdown: true, mds_shutdown_check: 0, //30, mds_verify_export_dirauth: true, @@ -684,15 +682,11 @@ void parse_config_options(std::vector& args) g_conf.mds_log_max_events = atoi(args[++i]); else if (strcmp(args[i], "--mds_log_max_segments") == 0) g_conf.mds_log_max_segments = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_max_trimming") == 0) - g_conf.mds_log_max_trimming = atoi(args[++i]); + else if (strcmp(args[i], "--mds_log_max_expiring") == 0) + g_conf.mds_log_max_expiring = atoi(args[++i]); - else if (strcmp(args[i], "--mds_commit_on_shutdown") == 0) - g_conf.mds_commit_on_shutdown = atoi(args[++i]); else if (strcmp(args[i], "--mds_shutdown_check") == 0) g_conf.mds_shutdown_check = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0) - g_conf.mds_log_flush_on_shutdown = atoi(args[++i]); else if (strcmp(args[i], "--mds_decay_halflife") == 0) g_conf.mds_decay_halflife = atoi(args[++i]); diff --git a/branches/sage/mds/config.h b/branches/sage/mds/config.h index a850b7462c202..2bc944f738200 100644 --- a/branches/sage/mds/config.h +++ b/branches/sage/mds/config.h @@ -197,9 +197,8 @@ struct md_config_t { bool mds_log; int mds_log_max_events; int mds_log_max_segments; - int mds_log_max_trimming; + int mds_log_max_expiring; int mds_log_pad_entry; - bool mds_log_flush_on_shutdown; int mds_log_eopen_size; float mds_bal_sample_interval; @@ -226,7 +225,6 @@ struct md_config_t { float mds_bal_minchunk; bool mds_trim_on_rejoin; - bool mds_commit_on_shutdown; int mds_shutdown_check; bool mds_verify_export_dirauth; // debug flag diff --git a/branches/sage/mds/include/lru.h b/branches/sage/mds/include/lru.h index 27edff8fdc00c..40dce1aa191ab 100644 --- a/branches/sage/mds/include/lru.h +++ b/branches/sage/mds/include/lru.h @@ -267,6 +267,15 @@ class LRU { return true; } + void lru_touch_entire_pintail() { + // promote entire pintail to the top lru + while (lru_pintail.get_length() > 0) { + LRUObject *o = lru_pintail.get_head(); + lru_pintail.remove(o); + lru_top.insert_tail(o); + } + } + // expire -- expire a single item LRUObject *lru_get_next_expire() { diff --git a/branches/sage/mds/mds/CInode.cc b/branches/sage/mds/mds/CInode.cc index af32163e0d141..5b604e6acb4f5 100644 --- a/branches/sage/mds/mds/CInode.cc +++ b/branches/sage/mds/mds/CInode.cc @@ -579,6 +579,17 @@ void CInode::decode_lock_state(int type, bufferlist& bl) } } +void CInode::clear_dirty_scattered(int type) +{ + dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; + switch (type) { + case LOCK_OTYPE_IDIR: + xlist_dirty_inode_mtime.remove_myself(); + break; + default: + assert(0); + } +} diff --git a/branches/sage/mds/mds/CInode.h b/branches/sage/mds/mds/CInode.h index bde3d867cef79..73433d0cdd139 100644 --- a/branches/sage/mds/mds/CInode.h +++ b/branches/sage/mds/mds/CInode.h @@ -331,15 +331,7 @@ public: void encode_lock_state(int type, bufferlist& bl); void decode_lock_state(int type, bufferlist& bl); - void clear_dirty_scattered(int type) { - switch (type) { - case LOCK_OTYPE_IDIR: - xlist_dirty_inode_mtime.remove_myself(); - break; - default: - assert(0); - } - } + void clear_dirty_scattered(int type); // -- caps -- (new) // client caps diff --git a/branches/sage/mds/mds/IdAllocator.h b/branches/sage/mds/mds/IdAllocator.h index e8a0f5436938f..51001f2236627 100644 --- a/branches/sage/mds/mds/IdAllocator.h +++ b/branches/sage/mds/mds/IdAllocator.h @@ -55,6 +55,7 @@ class IdAllocator { version_t get_version() { return version; } version_t get_committed_version() { return committed_version; } + version_t get_committing_version() { return committing_version; } // load/save from disk (hack) bool is_undef() { return state == STATE_UNDEF; } diff --git a/branches/sage/mds/mds/MDBalancer.cc b/branches/sage/mds/mds/MDBalancer.cc index fdc2faba5def9..88e524fe11322 100644 --- a/branches/sage/mds/mds/MDBalancer.cc +++ b/branches/sage/mds/mds/MDBalancer.cc @@ -317,7 +317,7 @@ void MDBalancer::do_fragmenting() if (!dir->is_auth()) continue; dout(0) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, 3); + mds->mdcache->split_dir(dir, 4); } split_queue.clear(); } @@ -339,6 +339,8 @@ void MDBalancer::do_rebalance(int beat) dout(5) << " do_rebalance: cluster loads are" << dendl; + mds->mdcache->migrator->clear_export_queue(); + // rescale! turn my mds_load back into meta_load units double load_fac = 1.0; if (mds_load[whoami].mds_load() > 0) { diff --git a/branches/sage/mds/mds/MDCache.cc b/branches/sage/mds/mds/MDCache.cc index 18d8a26844441..c0f128b0ea6b2 100644 --- a/branches/sage/mds/mds/MDCache.cc +++ b/branches/sage/mds/mds/MDCache.cc @@ -100,7 +100,6 @@ MDCache::MDCache(MDS *m) lru.lru_set_midpoint(g_conf.mds_cache_mid); did_shutdown_log_cap = false; - shutdown_commits = 0; } MDCache::~MDCache() @@ -1707,6 +1706,41 @@ void MDCache::rejoin_send_rejoins() rejoin_walk(dir, rejoins[auth]); } + + // rejoin root inodes, too + for (map::iterator p = rejoins.begin(); + p != rejoins.end(); + ++p) { + if (mds->is_rejoin()) { + // weak + if (p->first == 0 && root) + p->second->add_weak_inode(root->ino()); + if (get_inode(MDS_INO_STRAY(p->first))) + p->second->add_weak_inode(MDS_INO_STRAY(p->first)); + } else { + // strong + if (p->first == 0 && root) { + p->second->add_weak_inode(root->ino()); + p->second->add_strong_inode(root->ino(), root->get_replica_nonce(), + root->get_caps_wanted(), + root->authlock.get_state(), + root->linklock.get_state(), + root->dirfragtreelock.get_state(), + root->filelock.get_state(), + root->dirlock.get_state()); + } + if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { + p->second->add_weak_inode(in->ino()); + p->second->add_strong_inode(in->ino(), in->get_replica_nonce(), + in->get_caps_wanted(), + in->authlock.get_state(), + in->linklock.get_state(), + in->dirfragtreelock.get_state(), + in->filelock.get_state(), + in->dirlock.get_state()); + } + } + } if (!mds->is_rejoin()) { // i am survivor. send strong rejoin. @@ -2014,6 +2048,28 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) } } + // weak base inodes? (root, stray, etc.) + for (set::iterator p = weak->weak_inodes.begin(); + p != weak->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + assert(in); // hmm fixme wrt stray? + if (survivor && in->is_replica(from)) + inode_remove_replica(in, from); // this induces a lock gather completion + int inonce = in->add_replica(from); + dout(10) << " have base " << *in << dendl; + + if (ack) + ack->add_strong_inode(in->ino(), + inonce, + 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); + } + // full inodes? // dirty scatterlock content! for (list::iterator p = weak->full_inodes.begin(); @@ -2308,6 +2364,15 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) } } + // base inodes? (root, stray, etc.) + for (set::iterator p = strong->weak_inodes.begin(); + p != strong->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + dout(10) << " have base " << *in << dendl; + in->add_replica(from); + } + // send missing? if (missing) { // we expect a FULL soon. @@ -2686,6 +2751,32 @@ void MDCache::rejoin_send_acks() } } + // root inodes too + if (root) + for (map::iterator r = root->replicas_begin(); + r != root->replicas_end(); + ++r) { + ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree); + ack[r->first]->add_strong_inode(root->ino(), r->second, 0, + root->authlock.get_replica_state(), + root->linklock.get_replica_state(), + root->dirfragtreelock.get_replica_state(), + root->filelock.get_replica_state(), + root->dirlock.get_replica_state()); + } + if (stray) + for (map::iterator r = stray->replicas_begin(); + r != stray->replicas_end(); + ++r) { + ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree); + ack[r->first]->add_strong_inode(stray->ino(), r->second, 0, + stray->authlock.get_replica_state(), + stray->linklock.get_replica_state(), + stray->dirfragtreelock.get_replica_state(), + stray->filelock.get_replica_state(), + stray->dirlock.get_replica_state()); + } + // send acks for (map::iterator p = ack.begin(); p != ack.end(); @@ -3105,6 +3196,8 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, mapmdc = mdc; - } - void finish(int r) { - mdc->shutdown_commits--; - } -}; - class C_MDC_ShutdownCheck : public Context { MDCache *mdc; public: @@ -3452,6 +3537,8 @@ void MDCache::shutdown_start() if (g_conf.mds_shutdown_check) mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // g_conf.debug_mds = 10; } @@ -3467,60 +3554,27 @@ bool MDCache::shutdown_pass() return true; } - // commit dirs? - if (g_conf.mds_commit_on_shutdown) { - - if (shutdown_commits < 0) { - dout(1) << "shutdown_pass committing all dirty dirs" << dendl; - shutdown_commits = 0; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - CInode *in = it->second; - if (!in->is_dir()) continue; - - // commit any dirty dirfrag that's ours - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (dir->is_auth() && dir->is_dirty()) { - dir->commit(0, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } - } - } - } - - // commits? - if (shutdown_commits > 0) { - dout(7) << "shutdown_commits still waiting for " << shutdown_commits << dendl; - return false; - } - } - - // flush anything we can from the cache - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << dendl; - // flush batching eopens, so that we can properly expire them. mds->server->journal_opens(); // hrm, this is sort of a hack. // flush what we can from the log - if (g_conf.mds_log_flush_on_shutdown) { - mds->mdlog->set_max_events(0); - mds->mdlog->trim(); + mds->mdlog->set_max_events(0); + mds->mdlog->trim(); + + if (mds->mdlog->get_num_segments() > 1) { + dout(7) << "still >1 segments, waiting for log to trim" << dendl; + return false; } + trim(0); + dout(5) << "lru size now " << lru.lru_get_size() << dendl; + // SUBTREES - // send all imports back to 0. if (!subtrees.empty() && mds->get_nodeid() != 0 && !migrator->is_exporting() //&& //!migrator->is_importing() ) { - // export to root dout(7) << "looking for subtrees to export to mds0" << dendl; list ls; for (map >::iterator it = subtrees.begin(); @@ -3535,20 +3589,23 @@ bool MDCache::shutdown_pass() int max = 5; // throttle shutdown exports.. hack! for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; - dout(7) << "sending " << *dir << " back to mds0" << dendl; - migrator->export_dir(dir, 0); + int dest = dir->get_inode()->authority().first; + if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0; + dout(7) << "sending " << *dir << " back to mds" << dest << dendl; + migrator->export_dir(dir, dest); if (--max == 0) break; } } + // subtrees map not empty yet? if (!subtrees.empty()) { dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; show_subtrees(); migrator->show_importing(); migrator->show_exporting(); - if (!migrator->is_importing() && !migrator->is_exporting()) - show_cache(); + //if (!migrator->is_importing() && !migrator->is_exporting()) + //show_cache(); return false; } assert(subtrees.empty()); @@ -3556,35 +3613,32 @@ bool MDCache::shutdown_pass() assert(!migrator->is_importing()); + // empty out stray contents // FIXME dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl; - // cap log? - if (g_conf.mds_log_flush_on_shutdown) { - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - mds->mdlog->trim(); - } - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the log" << dendl; + mds->mdlog->cap(); + mds->mdlog->trim(); + } + + if (!mds->mdlog->empty()) { + dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() + << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; + return false; + } + + if (!did_shutdown_log_cap) { + // flush journal header + dout(7) << "writing header for (now-empty) journal" << dendl; + assert(mds->mdlog->empty()); + mds->mdlog->write_head(0); + // NOTE: filer active checker below will block us until this completes. + did_shutdown_log_cap = true; + return false; } // filer active? @@ -3593,8 +3647,7 @@ bool MDCache::shutdown_pass() return false; } - - // done? + // trim what we can from the cache if (lru.lru_get_size() > 0) { dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl; show_cache(); diff --git a/branches/sage/mds/mds/MDCache.h b/branches/sage/mds/mds/MDCache.h index d738b8ccd4962..d67d5a4cdf32b 100644 --- a/branches/sage/mds/mds/MDCache.h +++ b/branches/sage/mds/mds/MDCache.h @@ -333,11 +333,6 @@ public: map > purging_ls; map > > waiting_for_purge; - // shutdown crap - int shutdown_commits; - bool did_shutdown_log_cap; - friend class C_MDC_ShutdownCommit; - // -- recovery -- protected: set recovery_set; @@ -470,6 +465,8 @@ public: bool shutdown_pass(); bool shutdown(); // clear cache (ie at shutodwn) + bool did_shutdown_log_cap; + // inode_map bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? true:false; } CInode* get_inode( inodeno_t ino ) { diff --git a/branches/sage/mds/mds/MDLog.cc b/branches/sage/mds/mds/MDLog.cc index 0b069bc71f907..205ce60eadf8a 100644 --- a/branches/sage/mds/mds/MDLog.cc +++ b/branches/sage/mds/mds/MDLog.cc @@ -53,15 +53,22 @@ void MDLog::reopen_logger(utime_t start, bool append) if (!didit) { didit = true; mdlog_logtype.add_inc("evadd"); + mdlog_logtype.add_inc("evex"); mdlog_logtype.add_inc("evtrm"); - mdlog_logtype.add_set("evtrmg"); mdlog_logtype.add_set("ev"); + mdlog_logtype.add_set("evexg"); + mdlog_logtype.add_set("evexd"); + mdlog_logtype.add_inc("segadd"); - mdlog_logtype.add_inc("segtrm"); - mdlog_logtype.add_set("segtrmg"); + mdlog_logtype.add_inc("segex"); + mdlog_logtype.add_inc("segtrm"); mdlog_logtype.add_set("seg"); + mdlog_logtype.add_set("segexg"); + mdlog_logtype.add_set("segexd"); + mdlog_logtype.add_set("expos"); mdlog_logtype.add_set("wrpos"); + mdlog_logtype.add_avg("jlat"); } @@ -262,7 +269,8 @@ void MDLog::trim() dout(10) << "trim " << segments.size() << " / " << max_segments << " segments, " << num_events << " / " << max_events << " events" - << ", " << trimming_segments.size() << " (" << trimming_events << ") trimming" + << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" + << ", " << expired_segments.size() << " (" << expired_events << ") expired" << dendl; if (segments.empty()) return; @@ -274,13 +282,13 @@ void MDLog::trim() map::iterator p = segments.begin(); int left = num_events; while (p != segments.end() && - ((max_events >= 0 && left-trimming_events > max_events) || - (max_segments >= 0 && (int)(segments.size()-trimming_segments.size()) > max_segments))) { + ((max_events >= 0 && left-expiring_events-expired_events > max_events) || + (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) { if (stop < g_clock.now()) break; - if ((int)trimming_segments.size() >= g_conf.mds_log_max_trimming) + if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring) break; // look at first segment @@ -289,10 +297,12 @@ void MDLog::trim() p++; - if (trimming_segments.count(ls)) { - dout(5) << "trim already trimming segment " << ls->offset << ", " << ls->num_events << " events" << dendl; + if (expiring_segments.count(ls)) { + dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl; + } else if (expired_segments.count(ls)) { + dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl; } else { - try_trim(ls); + try_expire(ls); } left -= ls->num_events; @@ -300,57 +310,71 @@ void MDLog::trim() } -void MDLog::try_trim(LogSegment *ls) +void MDLog::try_expire(LogSegment *ls) { C_Gather *exp = ls->try_to_expire(mds); if (exp) { - trimming_segments.insert(ls); - trimming_events += ls->num_events; - dout(5) << "try_trim trimming segment " << ls->offset << dendl; - exp->set_finisher(new C_MaybeTrimmedSegment(this, ls)); + assert(expiring_segments.count(ls) == 0); + expiring_segments.insert(ls); + expiring_events += ls->num_events; + dout(5) << "try_expire expiring segment " << ls->offset << dendl; + exp->set_finisher(new C_MaybeExpiredSegment(this, ls)); } else { - dout(10) << "try_trim trimmed segment " << ls->offset << dendl; - _trimmed(ls); + dout(10) << "try_expire expired segment " << ls->offset << dendl; + _expired(ls); } - logger->set("segtrmg", trimming_segments.size()); - logger->set("evtrmg", trimming_events); + logger->set("segexg", expiring_segments.size()); + logger->set("evexg", expiring_events); } -void MDLog::_maybe_trimmed(LogSegment *ls) +void MDLog::_maybe_expired(LogSegment *ls) { - dout(10) << "_maybe_trimmed segment " << ls->offset << " " << ls->num_events << " events" << dendl; - assert(trimming_segments.count(ls)); - trimming_segments.erase(ls); - trimming_events -= ls->num_events; - try_trim(ls); + dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; + assert(expiring_segments.count(ls)); + expiring_segments.erase(ls); + expiring_events -= ls->num_events; + try_expire(ls); } -void MDLog::_trimmed(LogSegment *ls) +void MDLog::_expired(LogSegment *ls) { - dout(5) << "_trimmed segment " << ls->offset << " " << ls->num_events << " events" << dendl; + dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - // don't trim last segment, unless we're capped if (!capped && ls == get_current_segment()) { - dout(5) << "_trimmed not trimming " << ls->offset << ", last one and !capped" << dendl; - return; - } - - num_events -= ls->num_events; - - assert(segments.count(ls->offset)); - if (segments.begin()->second == ls) { - journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos - logger->set("expos", ls->offset); + dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl; + } else { + // expired. + expired_segments.insert(ls); + expired_events += ls->num_events; + + logger->inc("evex", ls->num_events); + logger->inc("segex"); + + // trim expired segments? + while (!segments.empty()) { + ls = segments.begin()->second; + if (!expired_segments.count(ls)) break; + + expired_events -= ls->num_events; + expired_segments.erase(ls); + num_events -= ls->num_events; + + journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos + + logger->set("expos", ls->offset); + logger->inc("segtrm"); + logger->inc("evtrm", ls->num_events); + + segments.erase(ls->offset); + delete ls; + } } - segments.erase(ls->offset); logger->set("ev", num_events); - logger->inc("evtrm", ls->num_events); + logger->set("evexd", expired_events); logger->set("seg", segments.size()); - logger->inc("segtrm"); - - delete ls; + logger->set("segexd", expired_segments.size()); } @@ -383,7 +407,6 @@ void MDLog::replay(Context *c) assert(num_events == 0); replay_thread.create(); - //_replay(); } class C_MDL_Replay : public Context { @@ -392,7 +415,6 @@ public: C_MDL_Replay(MDLog *l) : mdlog(l) {} void finish(int r) { mdlog->replay_cond.Signal(); - //mdlog->_replay(); } }; @@ -435,8 +457,6 @@ void MDLog::_replay_thread() logger->set("seg", segments.size()); } - le->_segment = get_current_segment(); // replay may need this - // have we seen an import map yet? if (segments.empty()) { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() @@ -444,9 +464,12 @@ void MDLog::_replay_thread() } else { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() << " : " << *le << dendl; + le->_segment = get_current_segment(); // replay may need this + le->_segment->num_events++; + num_events++; + le->replay(mds); - num_events++; if (!new_expire_pos) new_expire_pos = pos; } diff --git a/branches/sage/mds/mds/MDLog.h b/branches/sage/mds/mds/MDLog.h index 5c1f9260e2601..f7bdcd21a5303 100644 --- a/branches/sage/mds/mds/MDLog.h +++ b/branches/sage/mds/mds/MDLog.h @@ -82,8 +82,10 @@ class MDLog { // -- segments -- map segments; - set trimming_segments; - int trimming_events; + set expiring_segments; + set expired_segments; + int expiring_events; + int expired_events; class C_MDL_WroteSubtreeMap : public Context { MDLog *mdlog; @@ -132,7 +134,7 @@ public: journaler(0), logger(0), replay_thread(this), - trimming_events(0), + expiring_events(0), expired_events(0), writing_subtree_map(false) { } ~MDLog(); @@ -163,19 +165,19 @@ public: void flush(); private: - class C_MaybeTrimmedSegment : public Context { + class C_MaybeExpiredSegment : public Context { MDLog *mdlog; LogSegment *ls; public: - C_MaybeTrimmedSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} + C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} void finish(int res) { - mdlog->_maybe_trimmed(ls); + mdlog->_maybe_expired(ls); } }; - void try_trim(LogSegment *ls); - void _maybe_trimmed(LogSegment *ls); - void _trimmed(LogSegment *ls); + void try_expire(LogSegment *ls); + void _maybe_expired(LogSegment *ls); + void _expired(LogSegment *ls); public: void trim(); diff --git a/branches/sage/mds/mds/MDS.cc b/branches/sage/mds/mds/MDS.cc index 14452eb21c470..675e577adea82 100644 --- a/branches/sage/mds/mds/MDS.cc +++ b/branches/sage/mds/mds/MDS.cc @@ -1078,14 +1078,18 @@ void MDS::my_dispatch(Message *m) mdsmap->get_inst(from) != m->get_source_inst() || mdsmap->is_down(from)) { // bogus mds? - if (m->get_type() != MSG_MDS_MAP) { + if (m->get_type() == MSG_MDS_MAP) { + dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + << ", but it's an mdsmap, looking at it" << dendl; + } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && + mdsmap->get_inst(from) == m->get_source_inst()) { + dout(5) << "got " << *m << " from down mds " << m->get_source() + << ", but it's a cache_expire, looking at it" << dendl; + } else { dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() << ", dropping" << dendl; delete m; return; - } else { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << dendl; } } } diff --git a/branches/sage/mds/mds/Migrator.h b/branches/sage/mds/mds/Migrator.h index c9ab047f20aec..c4ee234546c3b 100644 --- a/branches/sage/mds/mds/Migrator.h +++ b/branches/sage/mds/mds/Migrator.h @@ -182,6 +182,9 @@ public: void export_dir_nicely(CDir *dir, int dest); void maybe_do_queued_export(); + void clear_export_queue() { + export_queue.clear(); + } void encode_export_inode(CInode *in, bufferlist& enc_state, map& exported_client_map); diff --git a/branches/sage/mds/mds/journal.cc b/branches/sage/mds/mds/journal.cc index 243acc037f740..1837f5ab9ff82 100644 --- a/branches/sage/mds/mds/journal.cc +++ b/branches/sage/mds/mds/journal.cc @@ -97,8 +97,9 @@ C_Gather *LogSegment::try_to_expire(MDS *mds) } // dirty non-auth mtimes - for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { - dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *p << dendl; + if(0) //fuckfuck + for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { + dout(10) << "try_to_expire waiting for dirlock mtime flush on " << **p << dendl; if (!gather) gather = new C_Gather; (*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); } @@ -118,7 +119,10 @@ C_Gather *LogSegment::try_to_expire(MDS *mds) // idalloc if (allocv > mds->idalloc->get_committed_version()) { - dout(10) << "try_to_expire saving idalloc table, need " << allocv << dendl; + dout(10) << "try_to_expire saving idalloc table, need " << allocv + << ", committed is " << mds->idalloc->get_committed_version() + << " (" << mds->idalloc->get_committing_version() << ")" + << dendl; if (!gather) gather = new C_Gather; mds->idalloc->save(gather->new_sub(), allocv); } @@ -151,6 +155,11 @@ C_Gather *LogSegment::try_to_expire(MDS *mds) // FIXME client requests...? // audit handling of anchor transactions? + if (gather) { + dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl; + } else { + dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl; + } return gather; } diff --git a/branches/sage/mds/mds/mdstypes.h b/branches/sage/mds/mds/mdstypes.h index 3992c7a61b7ac..71e24d5cc5555 100644 --- a/branches/sage/mds/mds/mdstypes.h +++ b/branches/sage/mds/mds/mdstypes.h @@ -631,7 +631,7 @@ protected: virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } - virtual void clear_dirty_scattered(int type) { } + virtual void clear_dirty_scattered(int type) { assert(0); } // --------------------------------------------- // ordering -- 2.39.5