From 33b9f700be74480178a3374e81d636f49126ec46 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 29 Jun 2007 00:44:48 +0000 Subject: [PATCH] * fixed migrator shutdown/fail notify waiter bug * some prelim scatterlock work for updated flag.. * projected_inode, and associated Server method rewrites (way cleaner!) git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1455 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 27 +- branches/sage/cephmds2/mds/CInode.cc | 55 +-- branches/sage/cephmds2/mds/CInode.h | 32 ++ branches/sage/cephmds2/mds/Locker.cc | 41 +- branches/sage/cephmds2/mds/Migrator.cc | 24 +- branches/sage/cephmds2/mds/ScatterLock.h | 16 +- branches/sage/cephmds2/mds/Server.cc | 349 +++++++----------- branches/sage/cephmds2/mds/Server.h | 2 +- branches/sage/cephmds2/mds/events/EMetaBlob.h | 4 +- 9 files changed, 286 insertions(+), 264 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 9072986197ceb..793aed50a515b 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -47,6 +47,29 @@ sage doc sage mds +- fix utime to use acquire_locks.. don't call into locker manually! + +- projected inode content skipping. + - roll projected_version in to a *projected_inode. clean up the *pi code in Server.cc. + - list projected_inode and list projected_dirfragtree, &push_projected_*(), etc. + +- make locks auth_pin for unstable states. + +- fix rename to delay the _apply. + - need to fix locking vs migration first. + +- then, mtimes: + - avoid migration race concern (on auth). + - writeback and dirty on gather. + - cleaned up pv/pi makes writebehind play nice with concurrent updates. + - should pin lock in LOCK state? + - scatterlock 'updated' flag. + - on replica, clear only on sync | rescatter. + - make sure "dirty" scatterlock prevents journal expire. + - EMetaBlob map dirty_scatter; + - mtime must be greater, or scatterlock must be !updated. 
+ + /- fix slave op commit/abort logic: / - recovering node needs to know what stray prepare ops committed / - include with import_map @@ -70,7 +93,7 @@ sage mds - revisit wrlocks, dir inode mtime updates. esp in rename. - if auth, pin and be happy. decide early. - make no attempt to dirty inodes until a gather - - pin scattered inodes +/ - pin scattered inodes - mtime will always get journaled... -> so, just make sure v/pv/dirtyness is sane on recovery... -> scatterlock should recover into scatter state, or whatever... @@ -87,6 +110,8 @@ sage mds - need to export stray crap to another mds.. - verify stray is empty on shutdown +- roll EAlloc into EMetaBlob (and maybe Purge) + - journal+recovery - file capabilities i/o - dirfrag split/merge diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 65a7938670d78..05018b8265502 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -365,16 +365,16 @@ void CInode::encode_lock_state(int type, bufferlist& bl) { switch (type) { case LOCK_OTYPE_IAUTH: - ::_encode(inode.ctime, bl); - ::_encode(inode.mode, bl); - ::_encode(inode.uid, bl); - ::_encode(inode.gid, bl); + _encode(inode.ctime, bl); + _encode(inode.mode, bl); + _encode(inode.uid, bl); + _encode(inode.gid, bl); break; case LOCK_OTYPE_ILINK: - ::_encode(inode.ctime, bl); - ::_encode(inode.nlink, bl); - ::_encode(inode.anchored, bl); + _encode(inode.ctime, bl); + _encode(inode.nlink, bl); + _encode(inode.anchored, bl); break; case LOCK_OTYPE_IDIRFRAGTREE: @@ -382,21 +382,21 @@ void CInode::encode_lock_state(int type, bufferlist& bl) break; case LOCK_OTYPE_IFILE: - ::_encode(inode.size, bl); - ::_encode(inode.mtime, bl); - ::_encode(inode.atime, bl); + _encode(inode.size, bl); + _encode(inode.mtime, bl); + _encode(inode.atime, bl); break; case LOCK_OTYPE_IDIR: - ::_encode(inode.mtime, bl); - { + _encode(inode.mtime, bl); + if (0) { map dfsz; for (map::iterator p = dirfrags.begin(); p != 
dirfrags.end(); ++p) if (p->second->is_auth()) dfsz[p->first] = p->second->get_nitems(); - ::_encode(dfsz, bl); + _encode(dfsz, bl); } break; @@ -412,18 +412,18 @@ void CInode::decode_lock_state(int type, bufferlist& bl) switch (type) { case LOCK_OTYPE_IAUTH: - ::_decode(tm, bl, off); + _decode(tm, bl, off); if (inode.ctime < tm) inode.ctime = tm; - ::_decode(inode.mode, bl, off); - ::_decode(inode.uid, bl, off); - ::_decode(inode.gid, bl, off); + _decode(inode.mode, bl, off); + _decode(inode.uid, bl, off); + _decode(inode.gid, bl, off); break; case LOCK_OTYPE_ILINK: - ::_decode(tm, bl, off); + _decode(tm, bl, off); if (inode.ctime < tm) inode.ctime = tm; - ::_decode(inode.nlink, bl, off); - ::_decode(inode.anchored, bl, off); + _decode(inode.nlink, bl, off); + _decode(inode.anchored, bl, off); break; case LOCK_OTYPE_IDIRFRAGTREE: @@ -431,16 +431,19 @@ void CInode::decode_lock_state(int type, bufferlist& bl) break; case LOCK_OTYPE_IFILE: - ::_decode(inode.size, bl, off); - ::_decode(inode.mtime, bl, off); - ::_decode(inode.atime, bl, off); + _decode(inode.size, bl, off); + _decode(inode.mtime, bl, off); + _decode(inode.atime, bl, off); break; case LOCK_OTYPE_IDIR: //::_decode(inode.size, bl, off); - ::_decode(tm, bl, off); - if (inode.mtime < tm) inode.mtime = tm; - { + _decode(tm, bl, off); + if (inode.mtime < tm) { + inode.mtime = tm; + dirlock.set_updated(); + } + if (0) { map dfsz; ::_decode(dfsz, bl, off); // hmm which to keep? 
diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index 4bc00344f7726..c0fc9c5881ffb 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -64,6 +64,7 @@ class CInode : public MDSCacheObject { static const int PIN_OPENINGDIR = 14; static const int PIN_REMOTEPARENT = 15; static const int PIN_BATCHOPENJOURNAL = 16; + static const int PIN_SCATTERED = 17; const char *pin_name(int p) { switch (p) { @@ -76,6 +77,7 @@ class CInode : public MDSCacheObject { case PIN_OPENINGDIR: return "openingdir"; case PIN_REMOTEPARENT: return "remoteparent"; case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; + case PIN_SCATTERED: return "scattered"; default: return generic_pin_name(p); } } @@ -119,6 +121,36 @@ class CInode : public MDSCacheObject { off_t last_open_journaled; // log offset for the last journaled EOpen + // projected values (only defined while dirty) + list projected_inode; + list projected_dirfragtree; + + inode_t *project_inode() { + if (projected_inode.empty()) + projected_inode.push_back(inode); + else + projected_inode.push_back(projected_inode.back()); + return &projected_inode.back(); + } + fragtree_t *project_dirfragtree() { + if (projected_dirfragtree.empty()) + projected_dirfragtree.push_back(dirfragtree); + else + projected_dirfragtree.push_back(projected_dirfragtree.back()); + return &projected_dirfragtree.back(); + } + void pop_and_dirty_projected_inode() { + mark_dirty(projected_inode.front().version); + inode = projected_inode.front(); + projected_inode.pop_front(); + } + void pop_and_dirty_projected_dirfragtree() { + mark_dirty(projected_inode.front().version); + dirfragtree = projected_dirfragtree.front(); + projected_dirfragtree.pop_front(); + } + + // -- cache infrastructure -- map dirfrags; // cached dir fragments diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 80ea84dae4da8..a974da76374b6 100644 --- 
a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -136,12 +136,13 @@ bool Locker::acquire_locks(MDRequest *mdr, // make list of items to authpin set mustpin = xlocks; - /* don't auth_pin wrlocks.. they're a moving target! (might import while an update is in progress) - for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) + for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) { if ((*p)->get_parent()->is_auth()) mustpin.insert(*p); - */ - + else + sorted.insert(*p); + } + map > mustpin_remote; // mds -> (object set) // can i auth pin them all now? @@ -1300,7 +1301,7 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) !lock->is_rdlocked() && !lock->is_xlocked() && lock->get_state() == LOCK_SYNC) - scatter_scatter(lock); + scatter_lock(lock); // can wrlock? if (lock->can_wrlock()) { @@ -1377,6 +1378,7 @@ void Locker::scatter_eval(ScatterLock *lock) auth, MDS_PORT_LOCKER); } lock->set_state(LOCK_LOCK); + //lock->get_parent()->put(CInode::PIN_SCATTERED); } } else { @@ -1400,6 +1402,15 @@ void Locker::scatter_eval(ScatterLock *lock) dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_LOCK); + //lock->get_parent()->put(CInode::PIN_SCATTERED); + + if (lock->is_updated()) { + // updated flag is set: we got new data during the gather. + // write-behind journal. 
+ // version_t v + + } + lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); } @@ -1427,6 +1438,7 @@ void Locker::scatter_eval(ScatterLock *lock) dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_SCATTER); + //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); if (lock->get_parent()->is_replicated()) { @@ -1499,8 +1511,10 @@ void Locker::scatter_sync(ScatterLock *lock) send_lock_message(lock, LOCK_AC_LOCK); lock->init_gather(); } else { - if (!lock->is_wrlocked()) + if (!lock->is_wrlocked()) { + //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it now, we're fine + } } lock->set_state(LOCK_GLOCKC); return; @@ -1567,6 +1581,7 @@ void Locker::scatter_scatter(ScatterLock *lock) send_lock_message(lock, LOCK_AC_SCATTER, data); } lock->set_state(LOCK_SCATTER); + //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); } @@ -1596,9 +1611,11 @@ void Locker::scatter_lock(ScatterLock *lock) case LOCK_SCATTER: if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) + !lock->get_parent()->is_replicated()) { + //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it. - + } + if (lock->get_parent()->is_replicated()) { send_lock_message(lock, LOCK_AC_LOCK); lock->init_gather(); @@ -1641,8 +1658,10 @@ void Locker::scatter_tempsync(ScatterLock *lock) case LOCK_SCATTER: if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) + !lock->get_parent()->is_replicated()) { + //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it. 
+ } if (lock->get_parent()->is_replicated()) { send_lock_message(lock, LOCK_AC_LOCK); @@ -1705,6 +1724,9 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_GLOCKS); } else { + //if (lock->get_state() == LOCK_SCATTER) + //lock->get_parent()->put(CInode::PIN_SCATTERED); + // encode and reply bufferlist data; lock->encode_locked_state(data); @@ -1718,6 +1740,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) assert(lock->get_state() == LOCK_LOCK); lock->decode_locked_state(m->get_data()); lock->set_state(LOCK_SCATTER); + //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); break; diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index c6cc1098b8539..c14aa6fc28aef 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -254,25 +254,27 @@ void Migrator::handle_mds_failure_or_stop(int who) } } else { // bystander failed. - if (p->second == EXPORT_WARNING) { - // exporter waiting for warning acks, let's fake theirs. - if (export_warning_ack_waiting[dir].count(who)) { + if (export_warning_ack_waiting.count(dir) && + export_warning_ack_waiting[dir].count(who)) { + export_warning_ack_waiting[dir].erase(who); + export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. + if (p->second == EXPORT_WARNING) { + // exporter waiting for warning acks, let's fake theirs. dout(10) << "faking export_warning_ack from mds" << who << " on " << *dir << " to mds" << export_peer[dir] << endl; - export_warning_ack_waiting[dir].erase(who); - export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. 
if (export_warning_ack_waiting[dir].empty()) export_go(dir); } } - if (p->second == EXPORT_NOTIFYING) { - // exporter is waiting for notify acks, fake it - if (export_notify_ack_waiting[dir].count(who)) { + if (export_notify_ack_waiting.count(dir) && + export_notify_ack_waiting[dir].count(who)) { + export_notify_ack_waiting[dir].erase(who); + if (p->second == EXPORT_NOTIFYING) { + // exporter is waiting for notify acks, fake it dout(10) << "faking export_notify_ack from mds" << who << " on " << *dir << " to mds" << export_peer[dir] << endl; - export_notify_ack_waiting[dir].erase(who); if (export_notify_ack_waiting[dir].empty()) export_finish(dir); } @@ -1725,6 +1727,10 @@ void Migrator::import_finish(CDir *dir, bool now) cache->show_subtrees(); audit(); + // re-eval scatterlock? + if (dir->inode->is_auth()) + mds->locker->scatter_eval(&dir->inode->dirlock); + // is it empty? if (dir->get_size() == 0 && !dir->inode->is_auth()) { diff --git a/branches/sage/cephmds2/mds/ScatterLock.h b/branches/sage/cephmds2/mds/ScatterLock.h index 3d8d9a0664020..85e6da1cd63d1 100644 --- a/branches/sage/cephmds2/mds/ScatterLock.h +++ b/branches/sage/cephmds2/mds/ScatterLock.h @@ -31,9 +31,9 @@ #define LOCK_LOCK__ // . W . . #define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. -#define LOCK_GLOCKC -22 // . w . w waiting for replicas+wrlocks (auth), or wrlocks to release (replica) -#define LOCK_SCATTER 23 // . W . W mtime updates on replicas allowed, no reads. stable here. -#define LOCK_GTEMPSYNCC -24 // . w . w GLOCKC|LOCK on replica +#define LOCK_GLOCKC -22 // . wp . wp waiting for replicas+wrlocks (auth), or wrlocks to release (replica) +#define LOCK_SCATTER 23 // . Wp . WP mtime updates on replicas allowed, no reads. stable here. +#define LOCK_GTEMPSYNCC -24 // . wp . wp GLOCKC|LOCK on replica #define LOCK_GSCATTERT -25 // r . LOCK on replica. #define LOCK_GLOCKT -26 // r . LOCK on replica. 
@@ -64,10 +64,12 @@ inline const char *get_scatterlock_state_name(int s) { class ScatterLock : public SimpleLock { int num_wrlock; - + bool updated; + public: ScatterLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo), - num_wrlock(0) {} + num_wrlock(0), + updated(false) {} int get_replica_state() { switch (state) { @@ -94,6 +96,10 @@ public: assert(0); } } + + void set_updated() { updated = true; } + void clear_updated() { updated = false; } + bool is_updated() { return updated; } void replicate_relax() { //if (state == LOCK_SYNC && !is_rdlocked()) diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 468b49ff63a12..7d35db01240d8 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -1199,18 +1199,20 @@ CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) * predirty the directory inode for a new dentry, if it is auth (and not root) * BUG: root inode doesn't get dirtied properly, currently. blech. */ -version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob, utime_t mtime) +version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) { version_t dirpv = 0; CInode *diri = dn->dir->inode; - if (diri->is_auth() && !diri->is_root() && + if (diri->is_auth() && + !diri->is_root() && mdr->wrlocks.count(&diri->dirlock)) { // only if we've wrlocked it. 
dirpv = diri->pre_dirty(); - inode_t *pi = blob->add_primary_dentry(diri->get_parent_dn(), true); + inode_t *pi = diri->project_inode(); pi->version = dirpv; - pi->ctime = pi->mtime = mtime; - dout(10) << "predirty_dn_diri ctime/mtime " << mtime << " pv " << dirpv << " on " << *diri << endl; + pi->ctime = pi->mtime = mdr->now; + blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); + dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << endl; } return dirpv; @@ -1232,6 +1234,11 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) // we were before, too. diri->mark_dirty(dirpv); dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl; + } else { + assert(!diri->is_auth() || diri->is_root() || + diri->is_frozen()); // then not auth, or still importing. + // dirlock scatterlock will propagate the update. + } /* any writebehind should be handled by the lock gather probably? } else { @@ -1242,9 +1249,6 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) // (something after the import, or the import itself) } */ - } else { - // we're not auth. dirlock scatterlock will propagate the update. - } } @@ -1271,8 +1275,9 @@ void Server::dirty_diri_mtime_writebehind(CInode *diri, utime_t mtime) // we're newly auth. write-behind. EUpdate *le = new EUpdate("dir.mtime writebehind"); le->metablob.add_dir_context(diri->get_parent_dn()->get_dir()); - inode_t *pi = le->metablob.add_primary_dentry(diri->get_parent_dn(), true); + inode_t *pi = diri->project_inode(); pi->version = diri->pre_dirty(); + le->metablob.add_primary_dentry(diri->get_parent_dn(), true, 0, pi); mds->mdlog->submit_entry(le); mds->mdlog->wait_for_sync(new C_MDS_DirtyDiriMtimeWB(this, diri, pi->version)); @@ -1325,26 +1330,20 @@ void Server::handle_client_stat(MDRequest *mdr) /* - * finisher: do a inode_file_write_finish and reply. 
+ * finisher for basic inode updates */ -class C_MDS_utime_finish : public Context { +class C_MDS_inode_update_finish : public Context { MDS *mds; MDRequest *mdr; CInode *in; - version_t pv; - utime_t mtime, atime; public: - C_MDS_utime_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t mt, utime_t at) : - mds(m), mdr(r), in(i), - pv(pdv), - mtime(mt), atime(at) { } + C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : + mds(m), mdr(r), in(i) { } void finish(int r) { assert(r == 0); // apply - in->inode.mtime = mtime; - in->inode.atime = atime; - in->mark_dirty(pv); + in->pop_and_dirty_projected_inode(); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -1362,64 +1361,34 @@ void Server::handle_client_utime(MDRequest *mdr) CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; - // write - if (!mds->locker->xlock_start(&cur->filelock, mdr)) + // xlock inode + set rdlocks = mdr->rdlocks; + set wrlocks = mdr->wrlocks; + set xlocks = mdr->xlocks; + xlocks.insert(&cur->filelock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; mds->balancer->hit_inode(cur, META_POP_IWR); - // prepare - version_t pdv = cur->pre_dirty(); - utime_t mtime = req->args.utime.mtime; - utime_t atime = req->args.utime.atime; - C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, mdr, cur, pdv, - mtime, atime); + // project update + inode_t *pi = cur->project_inode(); + pi->mtime = req->args.utime.mtime; + pi->atime = req->args.utime.atime; + pi->version = cur->pre_dirty(); + pi->ctime = g_clock.real_now(); // log + wait EUpdate *le = new EUpdate("utime"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = mtime; - pi->atime = mtime; - pi->ctime = g_clock.real_now(); - pi->version = pdv; + le->metablob.add_primary_dentry(cur->parent, true, 0, pi); mdlog->submit_entry(le); - 
mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); } -// -------------- - -/* - * finisher: do a inode_hard_xlock_finish and reply. - */ -class C_MDS_chmod_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - int mode; -public: - C_MDS_chmod_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int mo) : - mds(m), mdr(r), in(i), pv(pdv), mode(mo) { } - void finish(int r) { - assert(r == 0); - - // apply - in->inode.mode &= ~04777; - in->inode.mode |= (mode & 04777); - in->mark_dirty(pv); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - // chmod void Server::handle_client_chmod(MDRequest *mdr) @@ -1429,58 +1398,34 @@ void Server::handle_client_chmod(MDRequest *mdr) if (!cur) return; // write - if (!mds->locker->xlock_start(&cur->authlock, mdr)) + set rdlocks = mdr->rdlocks; + set wrlocks = mdr->wrlocks; + set xlocks = mdr->xlocks; + xlocks.insert(&cur->authlock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; mds->balancer->hit_inode(cur, META_POP_IWR); - // prepare - version_t pdv = cur->pre_dirty(); - int mode = req->args.chmod.mode; - C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, mdr, cur, pdv, - mode); + // project update + inode_t *pi = cur->project_inode(); + pi->mode = (pi->mode & ~04777) | (req->args.chmod.mode & 04777); /* replace permission bits only; keep the inode's file-type bits */ + pi->version = cur->pre_dirty(); + pi->ctime = g_clock.real_now(); // log + wait EUpdate *le = new EUpdate("chmod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mode = mode; - pi->version = pdv; - pi->ctime = g_clock.real_now(); - + le->metablob.add_primary_dentry(cur->parent, true, 0, pi); + mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); } //
chown -class C_MDS_chown_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - int uid, gid; -public: - C_MDS_chown_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int u, int g) : - mds(m), mdr(r), in(i), pv(pdv), uid(u), gid(g) { } - void finish(int r) { - assert(r == 0); - - // apply - if (uid >= 0) in->inode.uid = uid; - if (gid >= 0) in->inode.gid = gid; - in->mark_dirty(pv); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - void Server::handle_client_chown(MDRequest *mdr) { MClientRequest *req = mdr->client_request; @@ -1488,30 +1433,30 @@ void Server::handle_client_chown(MDRequest *mdr) if (!cur) return; // write - if (!mds->locker->xlock_start(&cur->authlock, mdr)) + set rdlocks = mdr->rdlocks; + set wrlocks = mdr->wrlocks; + set xlocks = mdr->xlocks; + xlocks.insert(&cur->authlock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; mds->balancer->hit_inode(cur, META_POP_IWR); - // prepare - version_t pdv = cur->pre_dirty(); - int uid = req->args.chown.uid; - int gid = req->args.chown.gid; - C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, mdr, cur, pdv, - uid, gid); - + // project update + inode_t *pi = cur->project_inode(); + if (req->args.chown.uid >= 0) pi->uid = req->args.chown.uid; /* negative uid means "leave owner unchanged", not root */ + if (req->args.chown.gid >= 0) pi->gid = req->args.chown.gid; /* negative gid means "leave group unchanged", not gid 0 */ + pi->version = cur->pre_dirty(); + pi->ctime = g_clock.real_now(); + // log + wait EUpdate *le = new EUpdate("chown"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - if (uid >= 0) pi->uid = uid; - if (gid >= 0) pi->gid = gid; - pi->version = pdv; - pi->ctime = g_clock.real_now(); + le->metablob.add_primary_dentry(cur->parent, true, 0, pi); mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); }
@@ -1629,12 +1574,11 @@ class C_MDS_mknod_finish : public Context { MDRequest *mdr; CDentry *dn; CInode *newi; - version_t pv; version_t dirpv; public: C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) : mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()), dirpv(dirpv_) {} + dirpv(dirpv_) {} void finish(int r) { assert(r == 0); @@ -1642,7 +1586,7 @@ public: dn->get_dir()->link_inode(dn, newi); // dirty inode, dn, dir - newi->mark_dirty(pv); + newi->mark_dirty(newi->inode.version + 1); // dir inode's mtime mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); @@ -1658,7 +1602,6 @@ public: }; - void Server::handle_client_mknod(MDRequest *mdr) { MClientRequest *req = mdr->client_request; @@ -1670,25 +1613,21 @@ void Server::handle_client_mknod(MDRequest *mdr) assert(newi); // it's a file. - dn->pre_dirty(); newi->inode.mode = req->args.mknod.mode; newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_FILE; + newi->inode.version = dn->pre_dirty() - 1; // prepare finisher EUpdate *le = new EUpdate("mknod"); le->metablob.add_client_req(req->get_reqid()); - - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, newi->inode.ctime); // dir mtime too - + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); + le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } @@ -1707,11 +1646,11 @@ void Server::handle_client_mkdir(MDRequest *mdr) assert(newi); // it's a directory. 
- dn->pre_dirty(); newi->inode.mode = req->args.mkdir.mode; newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_DIR; newi->inode.layout = g_OSD_MDDirLayout; + newi->inode.version = dn->pre_dirty() - 1; // ...and that new dir is empty. CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); @@ -1721,16 +1660,14 @@ void Server::handle_client_mkdir(MDRequest *mdr) // prepare finisher EUpdate *le = new EUpdate("mkdir"); le->metablob.add_client_req(req->get_reqid()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, newi->inode.ctime); // dir mtime too + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); + le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); le->metablob.add_dir(newdir, true); // log + wait - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); /* old export heuristic. pbly need to reimplement this at some point. 
@@ -1762,23 +1699,21 @@ void Server::handle_client_symlink(MDRequest *mdr) assert(newi); // it's a symlink - dn->pre_dirty(); newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_SYMLINK; newi->symlink = req->get_sarg(); + newi->inode.version = dn->pre_dirty() - 1; // prepare finisher EUpdate *le = new EUpdate("symlink"); le->metablob.add_client_req(req->get_reqid()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, newi->inode.ctime); // dir mtime too + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); + le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv); mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } @@ -1887,18 +1822,17 @@ class C_MDS_link_local_finish : public Context { MDRequest *mdr; CDentry *dn; CInode *targeti; - version_t dpv; - version_t tpv; + version_t dnpv; + version_t tipv; version_t dirpv; public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : + C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, + version_t dnpv_, version_t tipv_, version_t dirpv_) : mds(m), mdr(r), dn(d), targeti(ti), - dpv(d->get_projected_version()), - tpv(targeti->get_parent_dn()->get_projected_version()), - dirpv(dirpv_) { } + dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dpv, tpv, dirpv); + mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); } }; @@ -1907,52 +1841,42 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_local " << *dn << " to " << *targeti << endl; - // ok, let's do it. 
- // prepare log entry + // predirty NEW dentry + version_t dnpv = dn->pre_dirty(); + version_t tipv = targeti->pre_dirty(); + + // project inode update + inode_t *pi = targeti->project_inode(); + pi->nlink++; + pi->ctime = mdr->now; + pi->version = tipv; + + // log + wait EUpdate *le = new EUpdate("link_local"); le->metablob.add_client_req(mdr->reqid); - - // predirty - dn->pre_dirty(); - version_t tpdv = targeti->pre_dirty(); - - // add to event - utime_t now = g_clock.real_now(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, mdr->now); // dir inode's mtime + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote le->metablob.add_dir_context(targeti->get_parent_dir()); - inode_t *pi = le->metablob.add_primary_dentry(targeti->parent, true, targeti); // update old primary - - // update journaled target inode - pi->nlink++; - pi->ctime = mdr->now; - pi->version = tpdv; + le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - // finisher - C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, dirpv); - - // log + wait mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); } void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, version_t tpv, version_t dirpv) + version_t dnpv, version_t tipv, version_t dirpv) { dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; - // link and unlock the new dentry + // link and unlock the NEW dentry dn->dir->link_inode(dn, targeti->ino()); - dn->set_version(dpv); - dn->mark_dirty(dpv); + dn->mark_dirty(dnpv); - // update the target - targeti->inode.nlink++; - targeti->inode.ctime = mdr->now; - targeti->mark_dirty(tpv); + // target inode + 
targeti->pop_and_dirty_projected_inode(); - // dir inode's mtime + // new dentry dir mtime dirty_dn_diri(dn, dirpv, mdr->now); // bump target popularity @@ -2004,28 +1928,23 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) } dout(10) << " targeti auth has prepared nlink++" << endl; - // 2. create+journal new dentry, as with link_local. - // prepare log entry - EUpdate *le = new EUpdate("link_remote"); - le->metablob.add_client_req(mdr->reqid); - - // predirty + // go. + // predirty dentry dn->pre_dirty(); // add to event - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, mdr->now); // dir inode's mtime + EUpdate *le = new EUpdate("link_remote"); + le->metablob.add_client_req(mdr->reqid); + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - // finisher - C_MDS_link_remote_finish *fin = new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv); - // mark committing (needed for proper recovery) mdr->committing = true; // log + wait mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->wait_for_sync(new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); } void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, @@ -2035,7 +1954,6 @@ void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, // link the new dentry dn->dir->link_inode(dn, targeti->ino()); - dn->set_version(dpv); dn->mark_dirty(dpv); // dir inode's mtime @@ -2101,7 +2019,8 @@ void Server::handle_slave_link_prep(MDRequest *mdr) // add to event le->metablob.add_dir_context(targeti->get_parent_dir()); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, targeti); // update old primary + + inode_t *pi = dn->inode->project_inode(); // update journaled target inode bool inc; @@ -2114,6 +2033,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr) } pi->ctime = 
mdr->now; pi->version = tpv; + le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, tpv, inc)); } @@ -2356,31 +2276,32 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) le->metablob.add_client_req(mdr->reqid); version_t ipv = 0; // dirty inode version - inode_t *pi = 0; // the inode - + inode_t *ji = 0; // journaled projected inode if (dn->is_primary()) { // primary link. add stray dentry. assert(straydn); ipv = straydn->pre_dirty(dn->inode->inode.version); le->metablob.add_dir_context(straydn->dir); - pi = le->metablob.add_primary_dentry(straydn, true, dn->inode); + ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); } else { // remote link. update remote inode. ipv = dn->inode->pre_dirty(); le->metablob.add_dir_context(dn->inode->get_parent_dir()); - pi = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); // update primary + ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); } - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, mdr->now); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - // update journaled target inode + inode_t *pi = dn->inode->project_inode(); pi->nlink--; pi->ctime = mdr->now; pi->version = ipv; + *ji = *pi; // copy into journal + + // the unlinked dentry + dn->pre_dirty(); + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); + le->metablob.add_dir_context(dn->get_dir()); + le->metablob.add_null_dentry(dn, true); if (mdr->dst_reanchor_atid) le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); @@ -2495,7 +2416,7 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) // the unlinked dentry dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob, mdr->now); + version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); 
le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_null_dentry(dn, true); @@ -2946,7 +2867,11 @@ void Server::_rename_prepare(MDRequest *mdr, bool linkmerge = (srcdn->inode == destdn->inode && (srcdn->is_primary() || destdn->is_primary())); - inode_t *pi = 0; // inode getting nlink-- + mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); + if (destdn->dir != srcdn->dir) + mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); + + inode_t *ji = 0; // journaled inode getting nlink-- version_t ipv; // it's version if (linkmerge) { @@ -2956,7 +2881,7 @@ void Server::_rename_prepare(MDRequest *mdr, metablob->add_dir_context(destdn->dir); if (destdn->is_auth()) ipv = mdr->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); - pi = metablob->add_primary_dentry(destdn, true, destdn->inode); + ji = metablob->add_primary_dentry(destdn, true, destdn->inode); // do src dentry metablob->add_dir_context(srcdn->dir); @@ -2974,7 +2899,7 @@ void Server::_rename_prepare(MDRequest *mdr, metablob->add_dir_context(straydn->dir); if (straydn->is_auth()) ipv = mdr->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); - pi = metablob->add_primary_dentry(straydn, true, destdn->inode); + ji = metablob->add_primary_dentry(straydn, true, destdn->inode); } else if (destdn->is_remote()) { // remote. 
@@ -2982,7 +2907,7 @@ void Server::_rename_prepare(MDRequest *mdr, metablob->add_dir_context(destdn->inode->get_parent_dir()); if (destdn->inode->is_auth()) ipv = mdr->pvmap[destdn->inode] = destdn->inode->pre_dirty(); - pi = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary + ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary dout(10) << "remote targeti (nlink--) is " << *destdn->inode << endl; } else { @@ -3014,11 +2939,13 @@ void Server::_rename_prepare(MDRequest *mdr, metablob->add_null_dentry(srcdn, true); } - if (pi) { + if (ji) { // update journaled target inode + inode_t *pi = destdn->inode->project_inode(); pi->nlink--; pi->ctime = mdr->now; pi->version = ipv; + *ji = *pi; // copy into journal } // anchor updates? @@ -3041,12 +2968,10 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen (srcdn->is_primary() || destdn->is_primary())); // dir mtimes - /* - dirty_dn_diri(destdn, ddirpv, ictime); + dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now); if (destdn->dir != srcdn->dir) - dirty_dn_diri(srcdn, sdirpv, ictime); - */ - + dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now); + if (linkmerge) { if (destdn->is_primary()) { dout(10) << "merging remote onto primary link" << endl; @@ -3729,14 +3654,14 @@ void Server::handle_client_openc(MDRequest *mdr) dn->pre_dirty(); in->inode.mode = req->args.open.mode; in->inode.mode |= INODE_MODE_FILE; + in->inode.version = dn->get_projected_version(); // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); EUpdate *le = new EUpdate("openc"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(dn->dir); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, in); - pi->version = dn->get_projected_version(); + le->metablob.add_primary_dentry(dn, true, in, &in->inode); // log + wait mdlog->submit_entry(le); diff --git 
a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index b7e2197c522e5..ccfb6a16ddf27 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -78,7 +78,7 @@ public: CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); //CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); - version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob, utime_t mtime); + version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); void dirty_diri_mtime_writebehind(CInode *diri, utime_t mtime); diff --git a/branches/sage/cephmds2/mds/events/EMetaBlob.h b/branches/sage/cephmds2/mds/events/EMetaBlob.h index 5a5fd2a14e155..538ac4d28eda8 100644 --- a/branches/sage/cephmds2/mds/events/EMetaBlob.h +++ b/branches/sage/cephmds2/mds/events/EMetaBlob.h @@ -267,7 +267,7 @@ class EMetaBlob { } // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) { + inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { if (!in) in = dn->get_inode(); dirlump& lump = add_dir(dn->get_dir(), false); @@ -278,12 +278,14 @@ class EMetaBlob { dn->get_projected_version(), in->inode, in->symlink, dirty)); + if (pi) lump.get_dfull().front().inode = *pi; return &lump.get_dfull().front().inode; } else { lump.get_dfull().push_back(fullbit(dn->get_name(), dn->get_projected_version(), in->inode, in->symlink, dirty)); + if (pi) lump.get_dfull().back().inode = *pi; return &lump.get_dfull().back().inode; } } -- 2.39.5