From 2f86da0d1b37a063b641e90b935b81833311ebcf Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 21 Nov 2006 00:15:25 +0000 Subject: [PATCH] tons of mds recovery stuffs git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@964 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/Makefile | 1 + ceph/mds/CDentry.h | 4 +- ceph/mds/CDir.cc | 2 +- ceph/mds/CDir.h | 20 +- ceph/mds/CInode.cc | 22 +- ceph/mds/CInode.h | 35 ++- ceph/mds/LogEvent.cc | 10 + ceph/mds/LogEvent.h | 13 +- ceph/mds/MDCache.cc | 115 +++++++- ceph/mds/MDCache.h | 22 +- ceph/mds/MDLog.cc | 12 +- ceph/mds/MDS.cc | 16 +- ceph/mds/MDS.h | 4 - ceph/mds/MDStore.cc | 114 +++++--- ceph/mds/Migrator.cc | 10 +- ceph/mds/Server.cc | 50 +++- ceph/mds/Server.h | 1 + ceph/mds/events/EDirUpdate.h | 21 +- ceph/mds/events/EInodeUpdate.h | 81 ++---- ceph/mds/events/EMkdir.h | 62 +++++ ceph/mds/events/EMknod.h | 59 ++-- ceph/mds/events/EPurgeFinish.h | 49 ++++ ceph/mds/events/{ETraced.h => ETrace.h} | 57 ++-- ceph/mds/events/EUnlink.h | 77 ++---- ceph/mds/journal.cc | 345 ++++++++++++++++++++++++ 25 files changed, 894 insertions(+), 308 deletions(-) create mode 100644 ceph/mds/events/EMkdir.h create mode 100644 ceph/mds/events/EPurgeFinish.h rename ceph/mds/events/{ETraced.h => ETrace.h} (67%) create mode 100644 ceph/mds/journal.cc diff --git a/ceph/Makefile b/ceph/Makefile index 6ff4c5649699f..1681ac16698a8 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -30,6 +30,7 @@ EBOFS_OBJS= \ MDS_OBJS= \ mds/MDS.o\ + mds/journal.o\ mds/Server.o\ mds/MDCache.o\ mds/Locker.o\ diff --git a/ceph/mds/CDentry.h b/ceph/mds/CDentry.h index b8469db172f5f..a399ef7acfe5a 100644 --- a/ceph/mds/CDentry.h +++ b/ceph/mds/CDentry.h @@ -45,8 +45,8 @@ class CDentry { inodeno_t remote_ino; // if remote dentry // state - bool dirty; - __uint64_t parent_dir_version; // dir version when last touched. + bool dirty; + version_t parent_dir_version; // dir version when last touched. // locking int lockstate; diff --git a/ceph/mds/CDir.cc b/ceph/mds/CDir.cc index 84fd7a9bd978a..a590e6821e1de 100644 --- a/ceph/mds/CDir.cc +++ b/ceph/mds/CDir.cc @@ -27,7 +27,7 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << " cdir: " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " // PINS diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index e0d8bdae8d507..a1e857a72f9f9 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -203,9 +203,9 @@ class CDir { // state unsigned state; - __uint64_t version; - __uint64_t committing_version; - __uint64_t last_committed_version; + version_t version; + version_t committing_version; + version_t last_committed_version; // authority, replicas set open_by; // nodes that have me open @@ -410,16 +410,18 @@ class CDir { // -- dirtyness -- - __uint64_t get_version() { return version; } - void float_version(__uint64_t ge) { + version_t get_version() { return version; } + void float_version(version_t ge) { if (version < ge) version = ge; } - __uint64_t get_committing_version() { return committing_version; } - __uint64_t get_last_committed_version() { return last_committed_version; } + void set_version(version_t v) { version = v; } + + version_t get_committing_version() { return committing_version; } + version_t get_last_committed_version() { return last_committed_version; } // as in, we're committing the current version. void set_committing_version() { committing_version = version; } - void set_last_committed_version(__uint64_t v) { last_committed_version = v; } + void set_last_committed_version(version_t v) { last_committed_version = v; } void mark_dirty(); void mark_clean(); void mark_complete() { state_set(CDIR_STATE_COMPLETE); } @@ -570,7 +572,7 @@ typedef struct { inodeno_t ino; __uint64_t nitems; // actual real entries __uint64_t nden; // num dentries (including null ones) - __uint64_t version; + version_t version; unsigned state; meta_load_t popularity_justme; meta_load_t popularity_curdom; diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc index 581705acfb74f..1c24434e6baac 100644 --- a/ceph/mds/CInode.cc +++ b/ceph/mds/CInode.cc @@ -18,6 +18,7 @@ #include "CDentry.h" #include "MDS.h" +#include "MDCache.h" #include "AnchorTable.h" #include "common/Clock.h" @@ -26,7 +27,7 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " cinode: " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.inode(" << inode.ino << ") " int cinode_pins[CINODE_NUM_PINS]; // counts @@ -56,6 +57,8 @@ ostream& operator<<(ostream& out, CInode& in) if (in.is_symlink()) out << " symlink"; + out << " v" << in.get_version(); + out << " hard=" << in.hardlock; out << " file=" << in.filelock; @@ -88,7 +91,9 @@ ostream& operator<<(ostream& out, CInode& in) // ====== CInode ======= -CInode::CInode(bool auth) : LRUObject() { +CInode::CInode(MDCache *c, bool auth) : LRUObject() { + mdcache = c; + ref = 0; parent = NULL; @@ -100,7 +105,8 @@ CInode::CInode(bool auth) : LRUObject() { num_request_pins = 0; state = 0; - version = 0; + + committing_version = committed_version = 0; if (auth) state_set(CINODE_STATE_AUTH); } @@ -227,7 +233,7 @@ void CInode::mark_dirty() { assert(is_auth()); // touch my private version - version++; + inode.version++; if (!(state & CINODE_STATE_DIRTY)) { state |= CINODE_STATE_DIRTY; get(CINODE_PIN_DIRTY); @@ -243,6 +249,14 @@ void CInode::mark_dirty() { } } +void CInode::mark_clean() +{ + dout(10) << " mark_clean " << *this << endl; + if (state & CINODE_STATE_DIRTY) { + state &= ~CINODE_STATE_DIRTY; + put(CINODE_PIN_DIRTY); + } +} // state diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 0992dfd589754..3d754ad9c4fbc 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -19,8 +19,6 @@ #include "config.h" #include "include/types.h" #include "include/lru.h" -#include "common/DecayCounter.h" -//#include #include "CDentry.h" #include "Lock.h" @@ -158,6 +156,7 @@ class MDS; class Message; class CInode; class CInodeDiscover; +class MDCache; //class MInodeSyncStart; @@ -170,6 +169,8 @@ extern int cinode_pins[CINODE_NUM_PINS]; // counts // cached inode wrapper class CInode : public LRUObject { public: + MDCache *mdcache; + inode_t inode; // the inode itself CDir *dir; // directory, if we have it opened. @@ -182,8 +183,9 @@ class CInode : public LRUObject { protected: int ref; // reference count set ref_set; - version_t version; - version_t parent_dir_version; // dir version when last touched. + version_t parent_dir_version; // parent dir version when i was last touched. + version_t committing_version; + version_t committed_version; unsigned state; @@ -233,7 +235,7 @@ class CInode : public LRUObject { public: // --------------------------- - CInode(bool auth=true); + CInode(MDCache *c, bool auth=true); ~CInode(); @@ -298,24 +300,25 @@ class CInode : public LRUObject { // -- dirtyness -- - version_t get_version() { return version; } + version_t get_version() { return inode.version; } version_t get_parent_dir_version() { return parent_dir_version; } void float_parent_dir_version(version_t ge) { if (parent_dir_version < ge) parent_dir_version = ge; } - + version_t get_committing_version() { return committing_version; } + version_t get_last_committed_version() { return committed_version; } + void set_committing_version(version_t v) { committing_version = v; } + void set_committed_version() { + committed_version = committing_version; + committing_version = 0; + } + bool is_dirty() { return state & CINODE_STATE_DIRTY; } bool is_clean() { return !is_dirty(); } void mark_dirty(); - void mark_clean() { - dout(10) << " mark_clean " << *this << endl; - if (state & CINODE_STATE_DIRTY) { - state &= ~CINODE_STATE_DIRTY; - put(CINODE_PIN_DIRTY); - } - } + void mark_clean(); @@ -642,7 +645,6 @@ class CInodeExport { struct { inode_t inode; - version_t version; meta_load_t popularity_justme; meta_load_t popularity_curdom; bool is_dirty; // dirty inode? @@ -661,7 +663,6 @@ public: CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; - st.version = in->get_version(); st.is_dirty = in->is_dirty(); cached_by = in->cached_by; cached_by_nonce = in->cached_by_nonce; @@ -686,8 +687,6 @@ public: void update_inode(CInode *in, set& new_client_caps) { in->inode = st.inode; - in->version = st.version; - in->popularity[MDS_POP_JUSTME] += st.popularity_justme; in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; diff --git a/ceph/mds/LogEvent.cc b/ceph/mds/LogEvent.cc index 1c99ddda29527..5b15f487d77ab 100644 --- a/ceph/mds/LogEvent.cc +++ b/ceph/mds/LogEvent.cc @@ -22,6 +22,8 @@ #include "events/EUnlink.h" #include "events/EAlloc.h" #include "events/EMknod.h" +#include "events/EMkdir.h" +#include "events/EPurgeFinish.h" LogEvent *LogEvent::decode(bufferlist& bl) { @@ -54,6 +56,10 @@ LogEvent *LogEvent::decode(bufferlist& bl) case EVENT_UNLINK: le = new EUnlink(); break; + + case EVENT_PURGEFINISH: + le = new EPurgeFinish(); + break; case EVENT_ALLOC: le = new EAlloc(); @@ -63,6 +69,10 @@ LogEvent *LogEvent::decode(bufferlist& bl) le = new EMknod(); break; + case EVENT_MKDIR: + le = new EMkdir(); + break; + default: dout(1) << "uh oh, unknown event type " << type << endl; assert(0); diff --git a/ceph/mds/LogEvent.h b/ceph/mds/LogEvent.h index d54b502e3083e..0de268252036a 100644 --- a/ceph/mds/LogEvent.h +++ b/ceph/mds/LogEvent.h @@ -15,11 +15,18 @@ #define __LOGEVENT_H #define EVENT_STRING 1 + #define EVENT_INODEUPDATE 2 #define EVENT_DIRUPDATE 3 -#define EVENT_UNLINK 4 -#define EVENT_ALLOC 5 -#define EVENT_MKNOD 6 + +#define EVENT_ALLOC 10 +#define EVENT_MKNOD 11 +#define EVENT_MKDIR 12 +#define EVENT_LINK 13 + +#define EVENT_UNLINK 20 +#define EVENT_RMDIR 21 +#define EVENT_PURGEFINISH 22 #include diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 109e7ed4fb474..02e2a9cd1417d 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -38,9 +38,8 @@ #include "osdc/Filer.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" #include "events/EUnlink.h" +#include "events/EPurgeFinish.h" #include "messages/MGenericMessage.h" #include "messages/MDiscover.h" @@ -96,10 +95,11 @@ MDCache::MDCache(MDS *m) MDCache::~MDCache() { + delete migrator; + delete renamer; } - void MDCache::log_stat(Logger *logger) { if (get_root()) { @@ -132,7 +132,7 @@ bool MDCache::shutdown() CInode *MDCache::create_inode() { - CInode *in = new CInode; + CInode *in = new CInode(this); // zero memset(&in->inode, 0, sizeof(inode_t)); @@ -201,6 +201,100 @@ void MDCache::rename_file(CDentry *srcdn, +void MDCache::set_root(CInode *in) +{ + assert(root == 0); + root = in; + root->state_set(CINODE_STATE_ROOT); +} + +void MDCache::add_import(CDir *dir) +{ + imports.insert(dir); + dir->state_set(CDIR_STATE_IMPORT); + dir->get(CDIR_PIN_IMPORT); +} + + + + + +// ************** +// Inode purging -- reliably removing deleted file's objects + +class C_MDC_PurgeFinish : public Context { + MDCache *mdc; + inodeno_t ino; +public: + C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + void finish(int r) { + mdc->purge_inode_finish(ino); + } +}; +class C_MDC_PurgeFinish2 : public Context { + MDCache *mdc; + inodeno_t ino; +public: + C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + void finish(int r) { + mdc->purge_inode_finish_2(ino); + } +}; + +/* purge_inode in + * will be called by on unlink or rmdir + * caller responsible for journaling an appropriate EUnlink or ERmdir + */ +void MDCache::purge_inode(inode_t &inode) +{ + dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl; + + // take note + assert(purging.count(inode.ino) == 0); + purging[inode.ino] = inode; + + // remove + mds->filer->remove(inode, 0, inode.size, + 0, new C_MDC_PurgeFinish(this, inode.ino)); +} + +void MDCache::purge_inode_finish(inodeno_t ino) +{ + dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl; + + // log completion + mds->mdlog->submit_entry(new EPurgeFinish(ino), + new C_MDC_PurgeFinish2(this, ino)); +} + +void MDCache::purge_inode_finish_2(inodeno_t ino) +{ + dout(10) << "purge_inode_finish_2 " << ino << endl; + + // remove from purging list + purging.erase(ino); + + // tell anyone who cares (log flusher?) + list ls; + ls.swap(waiting_for_purge[ino]); + waiting_for_purge.erase(ino); + finish_contexts(ls, 0); + + // reclaim ino? + +} + +void MDCache::start_recovered_purges() +{ + for (map::iterator p = purging.begin(); + p != purging.end(); + ++p) { + dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl; + mds->filer->remove(p->second, 0, p->second.size, + 0, new C_MDC_PurgeFinish(this, p->first)); + } +} + @@ -529,7 +623,7 @@ int MDCache::open_root(Context *c) // open root inode if (whoami == 0) { // i am root inode - CInode *root = new CInode(); + CInode *root = new CInode(this); memset(&root->inode, 0, sizeof(inode_t)); root->inode.ino = 1; root->inode.hash_seed = 0; // not hashed! @@ -543,9 +637,8 @@ int MDCache::open_root(Context *c) root->inode.nlink = 1; root->inode.layout = g_OSD_MDDirLayout; - root->state_set(CINODE_STATE_ROOT); - set_root( root ); + add_inode( root ); // root directory too assert(root->dir == NULL); @@ -1682,13 +1775,13 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) assert(!m->has_base_dir()); // add in root - cur = new CInode(false); + cur = new CInode(this, false); m->get_inode(0).update_inode(cur); // root - cur->state_set(CINODE_STATE_ROOT); set_root( cur ); + add_inode( cur ); dout(7) << " got root: " << *cur << endl; // take waiters @@ -1814,7 +1907,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) assert(dn->inode == 0); // better not be something else linked to this dentry... // didn't have it. - in = new CInode(false); + in = new CInode(this, false); m->get_inode(i).update_inode(in); @@ -2136,7 +2229,7 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // log it if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME - mds->mdlog->submit_entry(new EUnlink(dir, dn), + mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode), NULL); // FIXME FIXME FIXME // tell replicas diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index 8f55bcf50b235..e62113312447f 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -100,12 +100,17 @@ class MDCache { // active MDS requests hash_map active_requests; + + // inode purging + map purging; + map > waiting_for_purge; // shutdown crap int shutdown_commits; bool did_shutdown_exports; friend class C_MDC_ShutdownCommit; + friend class CInode; friend class Locker; friend class Migrator; friend class Renamer; @@ -125,10 +130,10 @@ class MDCache { // root inode CInode *get_root() { return root; } - void set_root(CInode *r) { - root = r; - add_inode(root); - } + void set_root(CInode *r); + + void add_import(CDir *dir); + void remove_import(CDir *dir); // cache void set_cache_size(size_t max) { lru.lru_set_max(max); } @@ -168,6 +173,15 @@ class MDCache { } void rename_file(CDentry *srcdn, CDentry *destdn); + public: + // inode purging + void purge_inode(inode_t& inode); + void purge_inode_finish(inodeno_t ino); + void purge_inode_finish_2(inodeno_t ino); + void waitfor_purge(inodeno_t ino, Context *c); + void start_recovered_purges(); + + protected: // private methods CDir *get_auth_container(CDir *in); diff --git a/ceph/mds/MDLog.cc b/ceph/mds/MDLog.cc index da9451be8d466..b272eb9a176d6 100644 --- a/ceph/mds/MDLog.cc +++ b/ceph/mds/MDLog.cc @@ -97,21 +97,21 @@ void MDLog::write_head(Context *c) } -void MDLog::submit_entry( LogEvent *e, +void MDLog::submit_entry( LogEvent *le, Context *c ) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *e << endl; + dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; if (g_conf.mds_log) { // encode it, with event type bufferlist bl; - bl.append((char*)&e->_type, sizeof(e->_type)); - e->encode_payload(bl); + bl.append((char*)&le->_type, sizeof(le->_type)); + le->encode_payload(bl); // journal it. journaler->append_entry(bl); - delete e; + delete le; num_events++; logger->inc("add"); @@ -201,7 +201,7 @@ void MDLog::_trimmed(LogEvent *le) assert(le->can_expire(mds)); if (trimming.begin()->first == le->_end_off) { - // front! we can expire log a bit + // front! we can expire the log a bit journaler->set_expire_pos(le->_end_off); } diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index fe921e12535e3..a487d6469eb7a 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -316,7 +316,15 @@ void MDS::boot_recover(int step) switch (step) { case 0: - step = 1; + if (whoami == 0) { + dout(2) << "boot_recover " << step << ": creating root inode" << endl; + mdcache->open_root(0); + step = 1; + // fall-thru + } else { + // FIXME + assert(0); + } case 1: dout(2) << "boot_recover " << step << ": opening idalloc" << endl; @@ -345,6 +353,12 @@ void MDS::boot_recover(int step) break; case 5: + dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl; + mdcache->start_recovered_purges(); + step++; + // fall-thru + + case 6: dout(2) << "boot_recover " << step << ": done." << endl; mark_active(); } diff --git a/ceph/mds/MDS.h b/ceph/mds/MDS.h index b46e09b8d8ca5..1581d9c4049ca 100644 --- a/ceph/mds/MDS.h +++ b/ceph/mds/MDS.h @@ -24,7 +24,6 @@ using namespace std; #include -#include using namespace __gnu_cxx; #include "msg/Dispatcher.h" @@ -250,7 +249,4 @@ public: ostream& operator<<(ostream& out, MDS& mds); -//extern MDS *g_mds; - - #endif diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index 644c90dcf1545..432d56751b643 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -33,7 +33,7 @@ using namespace std; #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << ".store " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store " /* @@ -235,42 +235,49 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, // parse buffer contents into cache dout(15) << "bl is " << bl << endl; + + int off = 0; size_t size; - bl.copy(0, sizeof(size), (char*)&size); + __uint32_t num; + version_t got_version; + int got_hashcode; + bl.copy(off, sizeof(size), (char*)&size); + off += sizeof(size); assert(bl.length() >= size + sizeof(size)); + bl.copy(off, sizeof(num), (char*)&num); + off += sizeof(num); + bl.copy(off, sizeof(got_version), (char*)&got_version); + off += sizeof(got_version); + bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode); + off += sizeof(got_hashcode); + + assert(got_hashcode == hashcode); - int n; - bl.copy(sizeof(size), sizeof(n), (char*)&n); - - char *buffer = bl.c_str(); // contiguous ptr to whole buffer(list) - size_t buflen = bl.length(); - size_t p = sizeof(size_t); - - __uint32_t num = *(__uint32_t*)(buffer + p); - p += sizeof(num); + int buflen = bl.length(); dout(10) << " " << num << " items in " << size << " bytes" << endl; unsigned parsed = 0; while (parsed < num) { - assert(p < buflen && num > 0); + assert(off < buflen && num > 0); parsed++; - dout(24) << " " << parsed << "/" << num << " pos " << p-8 << endl; + dout(24) << " " << parsed << "/" << num << " pos " << off << endl; // dentry - string dname = buffer+p; - p += dname.length() + 1; + string dname; + ::_decode(dname, bl, off); dout(24) << "parse filename '" << dname << "'" << endl; CDentry *dn = dir->lookup(dname); // existing dentry? - if (*(buffer+p) == 'L') { - // hard link, we don't do that yet. - p++; - - inodeno_t ino = *(inodeno_t*)(buffer+p); - p += sizeof(ino); + char type = bl[off]; + ++off; + if (type == 'L') { + // hard link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); // what to do? if (hashcode >= 0) { @@ -301,20 +308,18 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl; } } - else if (*(buffer+p) == 'I') { + else if (type == 'I') { // inode - p++; // parse out inode - inode_t *inode = (inode_t*)(buffer+p); - p += sizeof(inode_t); + inode_t inode; + bl.copy(off, sizeof(inode), (char*)&inode); + off += sizeof(inode); string symlink; - if ((inode->mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { - symlink = (char*)(buffer+p); - p += symlink.length() + 1; - } - + if (inode.is_symlink()) + ::_decode(symlink, bl, off); + // what to do? if (hashcode >= 0) { int dentryhashcode = mds->hash_dentry( dir->ino(), dname ); @@ -328,19 +333,28 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, } else { // had dentry dout(12) << "readdir had dentry " << dname << endl; + + // under water? + if (dn->get_inode()->get_parent_dir_version() <= got_version) { + dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; + dn->get_inode()->mark_clean(); + dn->mark_clean(); + } } continue; } // add inode CInode *in = 0; - if (mds->mdcache->have_inode(inode->ino)) { - in = mds->mdcache->get_inode(inode->ino); - dout(12) << "readdir got (but i already had) " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; + if (mds->mdcache->have_inode(inode.ino)) { + in = mds->mdcache->get_inode(inode.ino); + dout(12) << "readdir got (but i already had) " << *in + << " mode " << in->inode.mode + << " mtime " << in->inode.mtime << endl; } else { // inode - in = new CInode(); - memcpy(&in->inode, inode, sizeof(inode_t)); + in = new CInode(mds->mdcache); + in->inode = inode; // symlink? if (in->is_symlink()) { @@ -356,7 +370,8 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; } else { - dout(1) << "corrupt directory, i got tag char '" << *(buffer+p) << "' val " << (int)(*(buffer+p)) << " at pos " << p << endl; + dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) + << " at pos " << off << endl; assert(0); } } @@ -378,12 +393,12 @@ class C_MDS_CommitDirVerify : public Context { public: MDS *mds; inodeno_t ino; - __uint64_t version; + version_t version; Context *c; C_MDS_CommitDirVerify( MDS *mds, inodeno_t ino, - __uint64_t version, + version_t version, Context *c) { this->mds = mds; this->c = c; @@ -428,7 +443,7 @@ class C_MDS_CommitDirFinish : public Context { protected: MDStore *ms; CDir *dir; - __uint64_t version; + version_t version; public: @@ -454,7 +469,7 @@ void MDStore::commit_dir( CDir *dir, } void MDStore::commit_dir( CDir *dir, - __uint64_t version, + version_t version, Context *c ) { assert(dir->is_auth() || @@ -520,7 +535,7 @@ void MDStore::commit_dir( CDir *dir, void MDStore::commit_dir_2( int result, CDir *dir, - __uint64_t committed_version) + version_t committed_version) { dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl; assert(committed_version == dir->get_committing_version()); @@ -549,7 +564,7 @@ class C_MDS_CommitSlice : public Context { CDir *dir; Context *c; int hashcode; - __uint64_t version; + version_t version; public: bufferlist bl; @@ -587,6 +602,10 @@ void MDStore::commit_dir_slice( CDir *dir, __uint32_t num = 0; bufferlist dirdata; + + version_t v = dir->get_version(); + dirdata.append((char*)&v, sizeof(v)); + dirdata.append((char*)&hashcode, sizeof(hashcode)); for (CDir_map_t::iterator it = dir->begin(); it != dir->end(); @@ -639,7 +658,13 @@ void MDStore::commit_dir_slice( CDir *dir, if (in->is_dirty()) { in->float_parent_dir_version( dir->get_version() ); dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl; + + in->set_committing_version( in->get_version() ); + assert(in->get_last_committed_version() < in->get_committing_version()); + } else { + assert(in->get_committing_version() == in->get_version()); } + } num++; @@ -669,7 +694,7 @@ void MDStore::commit_dir_slice( CDir *dir, void MDStore::commit_dir_slice_2( int result, CDir *dir, Context *c, - __uint64_t committed_version, + version_t committed_version, int hashcode ) { dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl; @@ -715,6 +740,9 @@ void MDStore::commit_dir_slice_2( int result, assert(in); assert(in->is_auth()); + if (in->get_committing_version()) + in->set_committed_version(); + if (committed_version > in->get_parent_dir_version()) { dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl; assert(!in->is_dirty()); diff --git a/ceph/mds/Migrator.cc b/ceph/mds/Migrator.cc index 7ae443cd936d7..bde26ae72dced 100644 --- a/ceph/mds/Migrator.cc +++ b/ceph/mds/Migrator.cc @@ -513,7 +513,7 @@ void Migrator::export_dir_go(CDir *dir, */ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) { - in->version++; // so local log entries are ignored, etc. (FIXME ??) + in->inode.version++; // so local log entries are ignored, etc. (FIXME ??) // tell (all) clients about migrating caps.. mark STALE for (map::iterator it = in->client_caps.begin(); @@ -960,7 +960,7 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) (*it)->update_inode(in); dout(7) << " updated " << *in << endl; } else { - in = new CInode(false); + in = new CInode(mds->mdcache, false); (*it)->update_inode(in); // link to the containing dir @@ -1300,7 +1300,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol bool added = false; CInode *in = cache->get_inode(istate.get_ino()); if (!in) { - in = new CInode; + in = new CInode(mds->mdcache); added = true; } else { in->set_auth(true); @@ -2406,7 +2406,7 @@ void Migrator::handle_hash_dir_prep(MHashDirPrep *m) it->second->update_inode(in); dout(5) << " updated " << *in << endl; } else { - in = new CInode(false); + in = new CInode(mds->mdcache, false); it->second->update_inode(in); cache->add_inode(in); @@ -2637,7 +2637,7 @@ void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) it->second->update_inode(in); dout(5) << " updated " << *in << endl; } else { - in = new CInode(false); + in = new CInode(mds->mdcache, false); it->second->update_inode(in); cache->add_inode(in); diff --git a/ceph/mds/Server.cc b/ceph/mds/Server.cc index cce2fa3a2ac3d..577b300acb2e3 100644 --- a/ceph/mds/Server.cc +++ b/ceph/mds/Server.cc @@ -36,6 +36,7 @@ #include "events/EInodeUpdate.h" #include "events/EDirUpdate.h" #include "events/EMknod.h" +#include "events/EMkdir.h" #include "include/filepath.h" #include "common/Timer.h" @@ -49,6 +50,10 @@ #include using namespace std; +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server " void Server::dispatch(Message *m) @@ -332,14 +337,30 @@ void Server::handle_client_request(MClientRequest *req) MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst()); // - if (refpath.last_bit() == ".hash" && - refpath.depth() > 1) { - dout(1) << "got explicit hash command " << refpath << endl; - CDir *dir = trace[trace.size()-1]->get_inode()->dir; - if (!dir->is_hashed() && - !dir->is_hashing() && - dir->is_auth()) - mdcache->migrator->hash_dir(dir); + // is this a special debug command? + if (refpath.depth() - 1 == trace.size() && + refpath.last_bit().find(".ceph.") == 0) { + CDir *dir = 0; + if (trace.empty()) + dir = mdcache->get_root()->dir; + else + dir = trace[trace.size()-1]->get_inode()->dir; + + dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl; + + if (refpath.last_bit() == ".ceph.hash" && + refpath.depth() > 1) { + dout(1) << "got explicit hash command " << refpath << endl; + CDir *dir = trace[trace.size()-1]->get_inode()->dir; + if (!dir->is_hashed() && + !dir->is_hashing() && + dir->is_auth()) + mdcache->migrator->hash_dir(dir); + } + else if (refpath.last_bit() == ".ceph.commit") { + dout(1) << "got explicit commit command on " << *dir << endl; + mds->mdstore->commit_dir(dir, 0); + } } // @@ -473,7 +494,7 @@ void Server::dispatch_request(Message *m, CInode *ref) // STAT void Server::handle_client_stat(MClientRequest *req, - CInode *ref) + CInode *ref) { // do I need file info? int mask = req->get_iarg(); @@ -913,7 +934,7 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *ref) // commit commit_request(req, new MClientReply(req, 0), ref, - new EInodeUpdate(newi)); // FIXME this is the wrong message + new EMknod(newi)); } // mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical. @@ -1016,7 +1037,7 @@ CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist) newi->mark_dirty(); // journal it - mdlog->submit_entry(new EMknod(newi)); + //mdlog->submit_entry(new EMknod(newi)); // ok! return newi; @@ -1066,6 +1087,8 @@ void Server::handle_client_link(MClientRequest *req, CInode *ref) CDir *dir = ref->dir; dout(7) << "handle_client_link dir is " << *dir << endl; + + // make sure it's my dentry int dauth = dir->dentry_authority(dname); if (dauth != whoami) { @@ -1974,8 +1997,9 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) // commit to log commit_request(req, new MClientReply(req, 0), diri, - new EInodeUpdate(newi),//); - new EDirUpdate(newdir)); // FIXME: weird performance regression here w/ double log; somewhat of a mystery! + new EMkdir(newdir)); + //new EInodeUpdate(newi),//); + //new EDirUpdate(newdir)); // FIXME: weird performance regression here w/ double log; somewhat of a mystery! return; } diff --git a/ceph/mds/Server.h b/ceph/mds/Server.h index fdba562c93ebf..912af31ca909a 100644 --- a/ceph/mds/Server.h +++ b/ceph/mds/Server.h @@ -27,6 +27,7 @@ class Server { __uint64_t stat_ops; + public: Server(MDS *m) : mds(m), diff --git a/ceph/mds/events/EDirUpdate.h b/ceph/mds/events/EDirUpdate.h index 5950450fc4b48..9c8881d4c91b9 100644 --- a/ceph/mds/events/EDirUpdate.h +++ b/ceph/mds/events/EDirUpdate.h @@ -17,40 +17,43 @@ #include #include "config.h" #include "include/types.h" -#include "ETraced.h" + #include "../LogEvent.h" +#include "ETrace.h" #include "../CDir.h" #include "../MDCache.h" #include "../MDStore.h" -class EDirUpdate : public ETraced { +class EDirUpdate : public LogEvent { protected: + ETrace trace; inodeno_t dirino; version_t version; public: - EDirUpdate(CDir *dir) : ETraced(EVENT_DIRUPDATE, dir->inode) { + EDirUpdate(CDir *dir) : LogEvent(EVENT_DIRUPDATE), + trace(dir->inode) { this->dirino = dir->ino(); version = dir->get_version(); } - EDirUpdate() : ETraced(EVENT_DIRUPDATE) { + EDirUpdate() : LogEvent(EVENT_DIRUPDATE) { } void print(ostream& out) { - out << "up dir " << dirino << " "; - ETraced::print(out); - out << "/ v " << version; + out << "up dir " << dirino << " " + << trace + << "/ v " << version; } virtual void encode_payload(bufferlist& bl) { - encode_trace(bl); + trace.encode(bl); bl.append((char*)&version, sizeof(version)); bl.append((char*)&dirino, sizeof(dirino)); } void decode_payload(bufferlist& bl, int& off) { - decode_trace(bl, off); + trace.decode(bl, off); bl.copy(off, sizeof(version), (char*)&version); off += sizeof(version); bl.copy(off, sizeof(dirino), (char*)&dirino); diff --git a/ceph/mds/events/EInodeUpdate.h b/ceph/mds/events/EInodeUpdate.h index 3cf3cd4376387..dba233c833883 100644 --- a/ceph/mds/events/EInodeUpdate.h +++ b/ceph/mds/events/EInodeUpdate.h @@ -18,85 +18,38 @@ #include "config.h" #include "include/types.h" -#include "ETraced.h" -#include "../MDStore.h" +#include "../LogEvent.h" +#include "ETrace.h" - -class EInodeUpdate : public ETraced { +class EInodeUpdate : public LogEvent { protected: - inode_t inode; + ETrace trace; public: - EInodeUpdate(CInode *in) : ETraced(EVENT_INODEUPDATE, in) { - this->inode = in->get_inode(); + EInodeUpdate(CInode *in) : LogEvent(EVENT_INODEUPDATE), + trace(in) { } - EInodeUpdate() : ETraced(EVENT_INODEUPDATE) { } + EInodeUpdate() : LogEvent(EVENT_INODEUPDATE) { } void print(ostream& out) { - out << "up inode " << inode.ino << " "; - ETraced::print(out); - out << " v " << inode.version; + out << "up inode " << trace.back().inode.ino + << " " << trace + << " v " << trace.back().inode.version; } - + virtual void encode_payload(bufferlist& bl) { - encode_trace(bl); - bl.append((char*)&inode, sizeof(inode)); + trace.encode(bl); } void decode_payload(bufferlist& bl, int& off) { - decode_trace(bl, off); - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); + trace.decode(bl, off); } - - bool can_expire(MDS *mds) { - // am i obsolete? - CInode *in = mds->mdcache->get_inode(inode.ino); - - //assert(in); - if (!in) { - dout(7) << "inode " << inode.ino << " not in cache, must have exported" << endl; - return true; - } - dout(7) << "EInodeUpdate obsolete? on " << *in << endl; - if (!in->is_auth()) - return true; // not my inode anymore! - if (in->get_version() != inode.version) - return true; // i'm obsolete! (another log entry follows) - - CDir *parent = in->get_parent_dir(); - if (!parent) return true; // root? - if (!parent->is_dirty()) return true; // dir is clean! - - // frozen -> exporting -> obsolete (FOR NOW?) - if (in->is_frozen()) - return true; + bool can_expire(MDS *mds); + void retire(MDS *mds, Context *c); + bool has_happened(MDS *mds); + void replay(MDS *mds); - return false; - } - - virtual void retire(MDS *mds, Context *c) { - // commit my containing directory - CInode *in = mds->mdcache->get_inode(inode.ino); - assert(in); - CDir *parent = in->get_parent_dir(); - - if (parent) { - // okay! - dout(7) << "commiting containing dir for " << *in << ", which is " << *parent << endl; - mds->mdstore->commit_dir(parent, c); - } else { - // oh, i'm the root inode - dout(7) << "don't know how to commit the root inode" << endl; - if (c) { - c->finish(0); - delete c; - } - } - - } - }; #endif diff --git a/ceph/mds/events/EMkdir.h b/ceph/mds/events/EMkdir.h new file mode 100644 index 0000000000000..f7f9c05c2207c --- /dev/null +++ b/ceph/mds/events/EMkdir.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __EMKDIR_H +#define __EMKDIR_H + +#include +#include "config.h" +#include "include/types.h" + +#include "ETrace.h" +#include "../MDS.h" +#include "../MDStore.h" + + +class EMkdir : public LogEvent { + protected: + ETrace trace; + //version_t pdirv; + + public: + EMkdir(CDir *dir) : LogEvent(EVENT_MKDIR), + trace(dir->inode) { + //pdirv = dir->inode->get_parent_dir()->get_version(); + } + EMkdir() : LogEvent(EVENT_MKDIR) { } + + void print(ostream& out) { + out << "mkdir "; + trace.print(out); + } + + virtual void encode_payload(bufferlist& bl) { + trace.encode(bl); + //bl.append((char*)&pdirv, sizeof(pdirv)); + } + void decode_payload(bufferlist& bl, int& off) { + trace.decode(bl, off); + //bl.copy(off, sizeof(pdirv), (char*)&pdirv); + //off += sizeof(pdirv); + } + + bool can_expire(MDS *mds); + void retire(MDS *mds, Context *c); + + // recovery + bool has_happened(MDS *mds); + void replay(MDS *mds); + +}; + +#endif diff --git a/ceph/mds/events/EMknod.h b/ceph/mds/events/EMknod.h index 6d43fa7955b0a..27ade4671a0c7 100644 --- a/ceph/mds/events/EMknod.h +++ b/ceph/mds/events/EMknod.h @@ -18,58 +18,43 @@ #include "config.h" #include "include/types.h" -#include "ETraced.h" +#include "../LogEvent.h" +#include "ETrace.h" +#include "../MDS.h" #include "../MDStore.h" -class EMknod : public ETraced { +class EMknod : public LogEvent { protected: + ETrace trace; + //version_t pdirv; + public: - EMknod(CInode *in) : ETraced(EVENT_MKNOD, in) { + EMknod(CInode *in) : LogEvent(EVENT_MKNOD), + trace(in) { + //pdirv = in->get_parent_dir()->get_version(); } - EMknod() : ETraced(EVENT_MKNOD) { } + EMknod() : LogEvent(EVENT_MKNOD) { } void print(ostream& out) { - out << "mknod "; - ETraced::print(out); + out << "mknod " << trace; } virtual void encode_payload(bufferlist& bl) { - encode_trace(bl); + trace.encode(bl); + //bl.append((char*)&pdirv, sizeof(pdirv)); } void decode_payload(bufferlist& bl, int& off) { - decode_trace(bl, off); - } - - bool can_expire(MDS *mds) { - // am i obsolete? - CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); - if (!diri) return true; - CDir *dir = diri->dir; - if (!dir) return true; + trace.decode(bl, off); + //bl.copy(off, sizeof(pdirv), (char*)&pdirv); + //off += sizeof(pdirv); + } - if (!dir->is_auth()) return true; // not mine! - if (dir->is_frozen()) return true; // frozen -> exporting -> obsolete? FIXME - - if (!dir->is_dirty()) return true; + bool can_expire(MDS *mds); + void retire(MDS *mds, Context *c); + bool has_happened(MDS *mds); + void replay(MDS *mds); - if (dir->get_committing_version() > trace.back().dirv) - return true; - - return false; - } - - virtual void retire(MDS *mds, Context *c) { - // commit directory - CInode *in = mds->mdcache->get_inode( trace.back().dirino ); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(10) << "EMknod committing dir " << *dir << endl; - mds->mdstore->commit_dir(dir, c); - } - }; #endif diff --git a/ceph/mds/events/EPurgeFinish.h b/ceph/mds/events/EPurgeFinish.h new file mode 100644 index 0000000000000..bacfa8e93c737 --- /dev/null +++ b/ceph/mds/events/EPurgeFinish.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __EPURGE_H +#define __EPURGE_H + +#include +#include "config.h" +#include "include/types.h" + +class EPurgeFinish : public LogEvent { + protected: + inodeno_t ino; + + public: + EPurgeFinish(inodeno_t i) : + LogEvent(EVENT_PURGEFINISH), + ino(i) { } + EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } + + void print(ostream& out) { + out << "purgefinish " << ino; + } + + virtual void encode_payload(bufferlist& bl) { + bl.append((char*)&ino, sizeof(ino)); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(ino), (char*)&ino); + } + + bool can_expire(MDS *mds); + void retire(MDS *mds, Context *c); + bool has_happened(MDS *mds); + void replay(MDS *mds); + +}; + +#endif diff --git a/ceph/mds/events/ETraced.h b/ceph/mds/events/ETrace.h similarity index 67% rename from ceph/mds/events/ETraced.h rename to ceph/mds/events/ETrace.h index 66f969d06567f..a320137512178 100644 --- a/ceph/mds/events/ETraced.h +++ b/ceph/mds/events/ETrace.h @@ -11,56 +11,51 @@ * */ - -#ifndef __MDS_ETRACED_H -#define __MDS_ETRACED_H +#ifndef __MDS_ETRACE_H +#define __MDS_ETRACE_H #include #include using namespace std; -#include "../LogEvent.h" #include "../CInode.h" #include "../CDir.h" #include "../CDentry.h" -#include "../MDCache.h" -// generic log event -class ETraced : public LogEvent { + +// path trace for use in journal events + +class ETrace { // segment. struct bit { inodeno_t dirino; version_t dirv; string dn; - inodeno_t ino; - version_t inov; + inode_t inode; bit(bufferlist& bl, int& off) { _decode(bl,off); } - bit(inodeno_t di, version_t dv, const string& d, inodeno_t i, version_t iv) : - dirino(di), dirv(dv), dn(d), ino(i), inov(iv) {} + bit(inodeno_t di, version_t dv, const string& d, inode_t i) : + dirino(di), dirv(dv), dn(d), inode(i) {} void _encode(bufferlist& bl) { bl.append((char*)&dirino, sizeof(dirino)); bl.append((char*)&dirv, sizeof(dirv)); ::_encode(dn, bl); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&inov, sizeof(inov)); + bl.append((char*)&inode, sizeof(inode)); } void _decode(bufferlist& bl, int& off) { bl.copy(off, sizeof(dirino), (char*)&dirino); off += sizeof(dirino); bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv); ::_decode(dn, bl, off); - bl.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - bl.copy(off, sizeof(inov), (char*)&inov); off += sizeof(inov); + bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); } }; - protected: + public: list trace; -public: - ETraced(int t, CInode *in = 0) : LogEvent(t) { + ETrace(CInode *in = 0) { if (in) { CDir *dir; CDentry *dn; @@ -70,16 +65,21 @@ public: dir = dn->get_dir(); if (!dir) break; - trace.push_front(bit(dir->ino(), dir->get_version(), + trace.push_front(bit(dir->ino(), + dir->get_version(), dn->get_name(), - in->ino(), in->get_version())); + in->inode)); in = dir->get_inode(); } while (!dir->is_import()); } } + + bit& back() { + return trace.back(); + } - void decode_trace(bufferlist& bl, int& off) { + void decode(bufferlist& bl, int& off) { int n; bl.copy(off, sizeof(n), (char*)&n); off += sizeof(n); @@ -87,7 +87,7 @@ public: trace.push_back( bit(bl, off) ); } - void encode_trace(bufferlist& bl) { + void encode(bufferlist& bl) { int n = trace.size(); bl.append((char*)&n, sizeof(n)); for (list::iterator i = trace.begin(); @@ -96,8 +96,8 @@ public: i->_encode(bl); } - void print(ostream& out) { - for (list::iterator p = trace.begin(); + void print(ostream& out) const { + for (list::const_iterator p = trace.begin(); p != trace.end(); p++) { if (p == trace.begin()) @@ -106,7 +106,14 @@ public: out << "/" << p->dn; } } - + + CInode *restore_trace(MDS *mds); + }; +inline ostream& operator<<(ostream& out, const ETrace& t) { + t.print(out); + return out; +} + #endif diff --git a/ceph/mds/events/EUnlink.h b/ceph/mds/events/EUnlink.h index ccea623bfebff..9b7484174886a 100644 --- a/ceph/mds/events/EUnlink.h +++ b/ceph/mds/events/EUnlink.h @@ -17,69 +17,48 @@ #include #include "config.h" #include "include/types.h" -#include "../LogEvent.h" -#include "../CInode.h" -#include "../MDCache.h" -#include "../MDStore.h" +#include "../LogEvent.h" +#include "ETrace.h" +#include "../CInode.h" +#include "../CDentry.h" +#include "../CDir.h" class EUnlink : public LogEvent { protected: - inodeno_t dir_ino; - __uint64_t version; + ETrace diritrace; + version_t dirv; string dname; + ETrace inodetrace; public: - EUnlink(CDir *dir, CDentry* dn) : - LogEvent(EVENT_UNLINK) { - this->dir_ino = dir->ino(); - this->dname = dn->get_name(); - this->version = dir->get_version(); - } - EUnlink() : - LogEvent(EVENT_UNLINK) { - } + EUnlink(CDir *dir, CDentry* dn, CInode *in) : + LogEvent(EVENT_UNLINK), + diritrace(dir->inode), + dirv(dir->get_version()), + dname(dn->get_name()), + inodetrace(in) {} + EUnlink() : LogEvent(EVENT_UNLINK) { } virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dir_ino, sizeof(dir_ino)); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)dname.c_str(), dname.length() + 1); + diritrace.encode(bl); + bl.append((char*)&dirv, sizeof(dirv)); + ::_encode(dname, bl); + inodetrace.encode(bl); } void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dir_ino), (char*)&dir_ino); - off += sizeof(dir_ino); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - dname = bl.c_str() + off; - off += dname.length() + 1; + diritrace.decode(bl,off); + bl.copy(off, sizeof(dirv), (char*)&dirv); + off += sizeof(dirv); + ::_decode(dname, bl, off); + inodetrace.decode(bl, off); } - virtual bool can_expire(MDS *mds) { - // am i obsolete? - CInode *idir = mds->mdcache->get_inode(dir_ino); - if (!idir) return true; - - CDir *dir = idir->dir; - - if (!dir) return true; - - if (!idir->dir->is_auth()) return true; - if (idir->dir->is_clean()) return true; - - if (idir->dir->get_last_committed_version() >= version) return true; - return false; - } - - virtual void retire(MDS *mds, Context *c) { - // commit my containing directory - CDir *dir = mds->mdcache->get_inode(dir_ino)->dir; - assert(dir); - - // okay! - dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; - mds->mdstore->commit_dir(dir, version, c); - } + bool can_expire(MDS *mds); + void retire(MDS *mds, Context *c); + bool has_happened(MDS *mds); + void replay(MDS *mds); }; #endif diff --git a/ceph/mds/journal.cc b/ceph/mds/journal.cc new file mode 100644 index 0000000000000..9ac2406e2cbc2 --- /dev/null +++ b/ceph/mds/journal.cc @@ -0,0 +1,345 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "events/ETrace.h" +#include "events/EMknod.h" +#include "events/EMkdir.h" +#include "events/EInodeUpdate.h" +#include "events/EPurgeFinish.h" +#include "events/EUnlink.h" + +#include "MDS.h" +#include "MDCache.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " + + +// ----------------------- +// ETrace + +CInode *ETrace::restore_trace(MDS *mds) +{ + CInode *in = 0; + for (list::iterator p = trace.begin(); + p != trace.end(); + ++p) { + // the dir + CInode *diri = mds->mdcache->get_inode(p->dirino); + if (!diri) { + dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl; + diri = new CInode(mds->mdcache); + diri->inode.ino = p->dirino; + diri->inode.mode = INODE_MODE_DIR; + mds->mdcache->add_inode(diri); + + CDir *dir = diri->get_or_open_dir(mds); + + // root? import? + if (p == trace.begin()) { + mds->mdcache->add_import(dir); + if (dir->ino() == 1) + mds->mdcache->set_root(diri); + } + } else { + dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl; + diri->get_or_open_dir(mds); + } + assert(diri->dir); + dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl; + + // the inode + in = mds->mdcache->get_inode(p->inode.ino); + if (!in) { + dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl; + in = new CInode(mds->mdcache); + in->inode = p->inode; + mds->mdcache->add_inode(in); + + // the dentry + CDentry *dn = diri->dir->add_dentry( p->dn, in ); + dn->mark_dirty(); + assert(dn); + } else { + dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl; + in->inode = p->inode; + } + dout(20) << "ETrace.restore_trace in is " << *in << endl; + } + return in; +} + + +// ----------------------- +// EMkdir +// - trace goes to new dir's inode. + +bool EMkdir::can_expire(MDS *mds) +{ + // am i obsolete? + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + CDir *dir = in->dir; + if (!dir) return true; + CDir *pdir = in->get_parent_dir(); + assert(pdir); + + dout(10) << "EMkdir.can_expire in is " << *in << endl; + dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl; + dout(10) << "EMkdir.can_expire dir is " << *dir << endl; + bool commitparent = in->get_last_committed_version() < trace.back().inode.version; + bool commitnew = dir->get_last_committed_version() == 0; + + if (commitparent || commitnew) return false; + return true; +} + +void EMkdir::retire(MDS *mds, Context *c) +{ + // commit parent dir AND my dir + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + assert(in); + CDir *dir = in->dir; + assert(dir); + CDir *pdir = in->get_parent_dir(); + assert(pdir); + + dout(10) << "EMkdir.retire in is " << *in << endl; + dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl; + dout(10) << "EMkdir.retire dir is " << *dir << endl; + bool commitparent = in->get_last_committed_version() < trace.back().inode.version; + bool commitnew = dir->get_last_committed_version() == 0; + + if (commitparent && commitnew) { + // both + dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl; + C_Gather *gather = new C_Gather(c); + mds->mdstore->commit_dir(pdir, gather->new_sub()); + mds->mdstore->commit_dir(dir, gather->new_sub()); + } else if (commitparent) { + // just parent + dout(10) << "EMkdir.retire committing parent dir " << *dir << endl; + mds->mdstore->commit_dir(pdir, c); + } else { + // just new dir + dout(10) << "EMkdir.retire committing new dir " << *dir << endl; + mds->mdstore->commit_dir(dir, c); + } +} + +bool EMkdir::has_happened(MDS *mds) +{ + return false; +} + +void EMkdir::replay(MDS *mds) +{ + dout(10) << "EMkdir.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + + // mark dir inode dirty + in->mark_dirty(); + + // mark parent dir dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); + + // mark new dir dirty + complete + CDir *dir = in->get_or_open_dir(mds); + dir->mark_dirty(); + dir->mark_complete(); +} + + + +// ----------------------- +// EMknod + +bool EMknod::can_expire(MDS *mds) +{ + // am i obsolete? + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + + if (!in->is_auth()) return true; // not my inode anymore! + if (in->get_version() != trace.back().inode.version) + return true; // i'm obsolete! (another log entry follows) + + if (in->get_last_committed_version() >= trace.back().inode.version) + return true; + + return false; +} + +void EMknod::retire(MDS *mds, Context *c) +{ + // commit parent directory + CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + dout(10) << "EMknod.retire committing parent dir " << *dir << endl; + mds->mdstore->commit_dir(dir, c); +} + +bool EMknod::has_happened(MDS *mds) +{ + return false; +} + +void EMknod::replay(MDS *mds) +{ + dout(10) << "EMknod.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + in->mark_dirty(); + + // mark parent dir dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); +} + + + +// ----------------------- +// EInodeUpdate + +bool EInodeUpdate::can_expire(MDS *mds) +{ + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + + if (!in->is_auth()) return true; // not my inode anymore! + if (in->get_version() != trace.back().inode.version) + return true; // i'm obsolete! (another log entry follows) + + /* + // frozen -> exporting -> obsolete (FOR NOW?) + if (in->is_frozen()) + return true; + */ + + if (in->get_last_committed_version() >= trace.back().inode.version) + return true; + + return false; +} + +void EInodeUpdate::retire(MDS *mds, Context *c) +{ + // commit parent directory + CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + dout(10) << "EMknod.retire committing parent dir " << *dir << endl; + mds->mdstore->commit_dir(dir, c); +} + +bool EInodeUpdate::has_happened(MDS *mds) +{ + return false; +} + +void EInodeUpdate::replay(MDS *mds) +{ + dout(10) << "EInodeUpdate.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + in->mark_dirty(); + + // mark parent dir dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); +} + + + +// ----------------------- +// EUnlink + +bool EUnlink::can_expire(MDS *mds) +{ + // dir + CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); + CDir *dir = 0; + if (diri) dir = diri->dir; + + if (dir && dir->get_last_committed_version() < dirv) return false; + + if (!inodetrace.trace.empty()) { + // inode + CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino ); + if (in && in->get_last_committed_version() < inodetrace.back().inode.version) + return false; + } + + return true; +} + +void EUnlink::retire(MDS *mds, Context *c) +{ + CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); + CDir *dir = diri->dir; + assert(dir); + + // okay! + dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; + mds->mdstore->commit_dir(dir, dirv, c); +} + +bool EUnlink::has_happened(MDS *mds) +{ + return true; +} + +void EUnlink::replay(MDS *mds) +{ +} + + + + +// ----------------------- +// EPurgeFinish + + +bool EPurgeFinish::can_expire(MDS *mds) +{ + return true; +} + +void EPurgeFinish::retire(MDS *mds, Context *c) +{ +} + +bool EPurgeFinish::has_happened(MDS *mds) +{ + return true; +} + +void EPurgeFinish::replay(MDS *mds) +{ +} + + + + -- 2.39.5