From: Sage Weil Date: Tue, 3 Jun 2008 00:23:52 +0000 (-0700) Subject: mds: fragstat rejoin basically works... X-Git-Tag: v0.3~170^2~15 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8474665f5616a18a2f586963ee37ac465d06872a;p=ceph.git mds: fragstat rejoin basically works... --- diff --git a/src/TODO b/src/TODO index 7d693cee0408..cda0fcbb0c2a 100644 --- a/src/TODO +++ b/src/TODO @@ -68,10 +68,6 @@ mds mustfix - fix rejoin vs updated dirfrag nested/dirlocks - not inode_full, now.. send the fnode. but for auth dirfrags, _to_ inode auth... which is sorta outside the subtree..? -- fix completed_request list on master, for disambiguating slaves. (sessionmap's complete list is trimmed based on client, not slave ops.) - - keep mdrequest open? is this useful for the srci export trimming? - - special list? needs master journal entry to finish commit, and extra message from slave->master after the commit. - /- locks (esp scatter) vs rejoin, scatter_writebehind - make sure locker avoids frozen inodes /- make sure predirty_nested stops if it can't wrlock versionlock (acquire_locks normally hides that detail for us) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 1a9c1faeeaf2..12ded70bc316 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -725,8 +725,6 @@ void CInode::finish_scatter_gather_update(int type) default: assert(0); } - - } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 22300963adc3..f55076d6c540 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -1789,7 +1789,17 @@ void MDCache::rejoin_send_rejoins() int auth = dir->get_dir_auth().first; assert(auth >= 0); - if (auth == mds->get_nodeid()) continue; // skip my own regions! + if (mds->is_rejoin() && auth == mds->get_nodeid()) { + // include dirfrag stat? 
+ int inauth = dir->inode->authority().first; + if (rejoins.count(inauth)) { + dout(10) << " sending dirfrag stat to mds" << inauth << " for " << *dir << dendl; + rejoins[inauth]->add_dirfrag_stat(dir->dirfrag(), + dir->fnode.fragstat, + dir->fnode.accounted_fragstat); + } + continue; // skip my own regions! + } if (rejoins.count(auth) == 0) continue; // don't care about this node's regions rejoin_walk(dir, rejoins[auth]); @@ -1929,14 +1939,6 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) assert(dn->inode->is_dir()); rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); dn->get_inode()->get_nested_dirfrags(nested); - - if (dn->get_inode()->dirlock.is_updated()) { // ******* FIXME ********* - // include full inode to shed any dirtyscattered state - rejoin->add_full_inode(dn->get_inode()->inode, - dn->get_inode()->symlink, - dn->get_inode()->dirfragtree); - dn->get_inode()->dirlock.clear_updated(); - } } } else { // STRONG @@ -2022,7 +2024,7 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) * the sender * - is recovering from their journal. * - may have incorrect (out of date) inode contents - * - will include full inodes IFF they contain dirty scatterlock content + * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient * * if the sender didn't trim_non_auth(), they * - may have incorrect (out of date) dentry/inode linkage @@ -2081,18 +2083,29 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) } } - // full inodes? - // dirty scatterlock content! - for (list::iterator p = weak->full_inodes.begin(); - p != weak->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime; - dout(10) << " got dirty inode scatterlock content " << *in << dendl; - in->dirlock.set_updated(); + if (!mds->is_rejoin()) { + // dirfrag stat? 
we only care if we are a survivor, and possibly + // doing a gather on the scatterlock in which we _need_ the + // replica's data. + for (map<dirfrag_t, pair<frag_info_t, frag_info_t> >::iterator p = weak->dirfrag_stat.begin(); + p != weak->dirfrag_stat.end(); + p++) { + CDir *dir = get_dirfrag(p->first); + assert(dir); + dout(10) << " got fragstat " << p->second.first << " " << p->second.second + << " for " << p->first << " on " << *dir << dendl; + dir->fnode.fragstat = p->second.first; + dir->fnode.accounted_fragstat = p->second.second; + dir->inode->dirlock.set_updated(); + if (dir->inode->dirlock.must_gather()) { + dout(10) << " completing must_gather gather on " << dir->inode->dirlock + << " on " << *dir->inode << dendl; + dir->inode->dirlock.remove_gather(from); + mds->locker->scatter_eval_gather(&dir->inode->dirlock); + } + } } - + // walk weak map for (map >::iterator p = weak->weak.begin(); p != weak->weak.end(); @@ -2128,12 +2141,12 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) assert(in); if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion + inode_remove_replica(in, from, true); // this induces a lock gather completion... usually! int inonce = in->add_replica(from); dout(10) << " have " << *in << dendl; // scatter the dirlock, just in case? 
- if (!survivor && in->is_dir()) + if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag()) in->dirlock.set_state(LOCK_SCATTER); if (ack) { @@ -3661,7 +3674,7 @@ void MDCache::discard_delayed_expire(CDir *dir) delayed_expire.erase(dir); } -void MDCache::inode_remove_replica(CInode *in, int from) +void MDCache::inode_remove_replica(CInode *in, int from, bool will_readd) { in->remove_replica(from); in->mds_caps_wanted.erase(from); @@ -3672,7 +3685,11 @@ void MDCache::inode_remove_replica(CInode *in, int from) if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); if (in->filelock.remove_replica(from)) mds->locker->file_eval_gather(&in->filelock); - if (in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); + + // don't complete gather if we will re-add (i.e. we are rejoining) + // and dirlock must_gather actual data... + if ((!will_readd || !in->dirlock.must_gather()) && + in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); // alone now? 
/* diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 506808db5178..9c4749214185 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -674,7 +674,7 @@ public: lru.lru_midtouch(dn); } - void inode_remove_replica(CInode *in, int rep); + void inode_remove_replica(CInode *in, int rep, bool will_readd=false); void dentry_remove_replica(CDentry *dn, int rep); void rename_file(CDentry *srcdn, CDentry *destdn); diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index e1adeb9d91f5..84416dcf2b38 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -103,6 +103,12 @@ public: } } + // true if we are gathering and need the replica's data to be consistent + bool must_gather() { + return (state == LOCK_GTEMPSYNCC || + state == LOCK_GLOCKC); + } + void set_updated() { if (!updated) { parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index 7d44af72cc3e..d687b1fa4150 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -159,6 +159,7 @@ class MMDSCacheRejoin : public Message { int32_t op; // weak + map<dirfrag_t, pair<frag_info_t, frag_info_t> > dirfrag_stat; map > weak; set weak_inodes; @@ -214,6 +215,10 @@ class MMDSCacheRejoin : public Message { } // dirfrags + void add_dirfrag_stat(dirfrag_t df, const frag_info_t &fs, const frag_info_t &afs) { + dirfrag_stat[df].first = fs; + dirfrag_stat[df].second = afs; + } void add_weak_dirfrag(dirfrag_t df) { weak[df]; } @@ -250,6 +255,7 @@ class MMDSCacheRejoin : public Message { ::encode(xlocked_inodes, payload); ::encode(cap_export_bl, payload); ::encode(strong_dirfrags, payload); + ::encode(dirfrag_stat, payload); ::encode(weak, payload); ::encode(weak_inodes, payload); ::encode(strong_dentries, payload); @@ -270,6 +276,7 @@ class MMDSCacheRejoin : public Message { ::decode(cap_export_paths, q); } ::decode(strong_dirfrags, p); + ::decode(dirfrag_stat, p); ::decode(weak, p); ::decode(weak_inodes, p); ::decode(strong_dentries, p);