From: Sage Weil Date: Thu, 29 May 2008 16:06:20 +0000 (-0700) Subject: mds: fixed up accounting bugs with scatterlock X-Git-Tag: v0.3~170^2~30 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=01df6cdea9d705620e89a7a4a34770f9c4690ae0;p=ceph.git mds: fixed up accounting bugs with scatterlock --- diff --git a/src/TODO b/src/TODO index 9aae898bf847..ccdc88d7ef97 100644 --- a/src/TODO +++ b/src/TODO @@ -86,6 +86,8 @@ mds mustfix mds - bind lease, cap timeouts to session renew, so that they can be _longer_ than the session renew interval +- can we get rid of the dirlock remote auth_pin weirdness on subtree roots? + - lease length heuristics - mds lock last_change stamp? diff --git a/src/include/types.h b/src/include/types.h index 61472191ef7c..2186085872f9 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -234,6 +234,7 @@ struct frag_info_t { } void encode(bufferlist &bl) const { + ::encode(version, bl); ::encode(mtime, bl); ::encode(nfiles, bl); ::encode(nsubdirs, bl); @@ -244,6 +245,7 @@ struct frag_info_t { ::encode(rctime, bl); } void decode(bufferlist::iterator &bl) { + ::decode(version, bl); ::decode(mtime, bl); ::decode(nfiles, bl); ::decode(nsubdirs, bl); diff --git a/src/kernel/messenger.c b/src/kernel/messenger.c index bd32cd321b2f..183989b81aec 100644 --- a/src/kernel/messenger.c +++ b/src/kernel/messenger.c @@ -1595,9 +1595,9 @@ out: void ceph_msg_put(struct ceph_msg *m) { - BUG_ON(atomic_read(&m->nref) <= 0); dout(20, "ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref), atomic_read(&m->nref)-1); + BUG_ON(atomic_read(&m->nref) <= 0); if (atomic_dec_and_test(&m->nref)) { dout(20, "ceph_msg_put last one on %p\n", m); WARN_ON(!list_empty(&m->list_head)); diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index f2678eee69dc..2b1ef59f7e12 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -53,13 +53,17 @@ ostream& operator<<(ostream& out, CDir& dir) if (dir.is_replicated()) out << dir.get_replicas(); - out << " pv=" << dir.get_projected_version(); + if (dir.is_projected()) + out << " pv=" << dir.get_projected_version(); out << " v=" << dir.get_version(); out << " cv=" << dir.get_committing_version(); out << "/" << dir.get_committed_version(); out << "/" << dir.get_committed_version_equivalent(); } else { - out << " rep@" << dir.authority(); + pair a = dir.authority(); + out << " rep@" << a.first; + if (a.second != CDIR_AUTH_UNKNOWN) + out << "," << a.second; if (dir.get_replica_nonce() > 1) out << "." << dir.get_replica_nonce(); } @@ -88,12 +92,16 @@ ostream& operator<<(ostream& out, CDir& dir) if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; + //out << " " << dir.fnode.fragstat; out << " s=" << dir.fnode.fragstat.size() << "=" << dir.fnode.fragstat.nfiles << "+" << dir.fnode.fragstat.nsubdirs; - out << " rb=" << dir.fnode.fragstat.rbytes << "/" << dir.fnode.accounted_fragstat.rbytes; - out << " rf=" << dir.fnode.fragstat.rfiles << "/" << dir.fnode.accounted_fragstat.rfiles; - out << " rd=" << dir.fnode.fragstat.rsubdirs << "/" << dir.fnode.accounted_fragstat.rsubdirs; + out << " rb=" << dir.fnode.fragstat.rbytes; + if (dir.is_projected()) out << "/" << dir.fnode.accounted_fragstat.rbytes; + out << " rf=" << dir.fnode.fragstat.rfiles; + if (dir.is_projected()) out << "/" << dir.fnode.accounted_fragstat.rfiles; + out << " rd=" << dir.fnode.fragstat.rsubdirs; + if (dir.is_projected()) out << "/" << dir.fnode.accounted_fragstat.rsubdirs; out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); if (dir.get_num_dirty()) @@ -1583,6 +1591,7 @@ void CDir::adjust_nested_anchors(int by) inode->adjust_nested_anchors(by); } +#ifdef MDS_VERIFY_FRAGSTAT void CDir::verify_fragstat() { assert(is_complete()); @@ -1626,6 +1635,7 @@ void CDir::verify_fragstat() dout(0) << "verify_fragstat ok " << fnode.fragstat << " on " << *this << dendl; } } +#endif /***************************************************************************** * FREEZING diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index d5954fc16ca8..42cfc6c9e184 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -54,7 +54,10 @@ ostream& operator<<(ostream& out, CInode& in) if (in.is_replicated()) out << in.get_replicas(); } else { - out << "rep@" << in.authority(); + pair a = in.authority(); + out << "rep@" << a.first; + if (a.second != CDIR_AUTH_UNKNOWN) + out << "," << a.second; out << "." << in.get_replica_nonce(); assert(in.get_replica_nonce() >= 0); } @@ -70,25 +73,37 @@ ostream& operator<<(ostream& out, CInode& in) if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; - out << " nl=" << in.inode.nlink; - - out << " s=" << in.inode.size; - out << " rb=" << in.inode.dirstat.rbytes << "/" << in.inode.accounted_dirstat.rbytes; - out << " rf=" << in.inode.dirstat.rfiles << "/" << in.inode.accounted_dirstat.rfiles; - out << " rd=" << in.inode.dirstat.rsubdirs << "/" << in.inode.accounted_dirstat.rsubdirs; + if (in.inode.is_dir()) { + out << " ds=" << in.inode.dirstat.size() << "=" + << in.inode.dirstat.nfiles << "+" << in.inode.dirstat.nsubdirs; + //out << " " << in.inode.dirstat; + //if (in.inode.dirstat.version > 10000) out << " BADDIRSTAT"; + } else { + out << " s=" << in.inode.size; + if (in.inode.max_size) + out << "/" << in.inode.max_size; + //out << " nl=" << in.inode.nlink; + } + out << " rb=" << in.inode.dirstat.rbytes; + if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rbytes; + out << " rf=" << in.inode.dirstat.rfiles; + if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rfiles; + out << " rd=" << in.inode.dirstat.rsubdirs; + if (in.is_projected()) out << "/" << in.inode.accounted_dirstat.rsubdirs; + // locks out << " " << in.authlock; out << " " << in.linklock; out << " " << in.dirfragtreelock; - out << " " << in.filelock; - out << " " << in.dirlock; + if (in.inode.is_dir()) + out << " " << in.dirlock; + else + out << " " << in.filelock; out << " " << in.xattrlock; - if (in.inode.max_size) - out << " size=" << in.inode.size << "/" << in.inode.max_size; - // hack: spit out crap on which clients have caps + /* if (!in.get_client_caps().empty()) { out << " caps={"; for (map::iterator it = in.get_client_caps().begin(); @@ -99,6 +114,7 @@ ostream& operator<<(ostream& out, CInode& in) } out << "}"; } + */ if (in.get_num_ref()) { out << " |"; @@ -528,6 +544,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl) case CEPH_LOCK_IDIR: { + dout(15) << "encode_lock_state inode.dirstat is " << inode.dirstat << dendl; ::encode(inode.dirstat, bl); // only meaningful if i am auth. bufferlist tmp; __u32 n = 0; @@ -536,9 +553,12 @@ void CInode::encode_lock_state(int type, bufferlist& bl) ++p) if (is_auth() || p->second->is_auth()) { dout(15) << "encode_lock_state fragstat for " << *p->second << dendl; + dout(20) << " fragstat " << p->second->fnode.fragstat << dendl; + dout(20) << " accounted_fragstat " << p->second->fnode.accounted_fragstat << dendl; frag_t fg = p->second->dirfrag().frag; ::encode(fg, tmp); ::encode(p->second->fnode.fragstat, tmp); + ::encode(p->second->fnode.accounted_fragstat, tmp); n++; } ::encode(n, bl); @@ -610,26 +630,34 @@ void CInode::decode_lock_state(int type, bufferlist& bl) { frag_info_t dirstat; ::decode(dirstat, p); - if (!is_auth()) + if (!is_auth()) { + dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl; inode.dirstat = dirstat; // take inode summation if replica + } __u32 n; ::decode(n, p); while (n--) { frag_t fg; frag_info_t fragstat; + frag_info_t accounted_fragstat; ::decode(fg, p); ::decode(fragstat, p); + ::decode(accounted_fragstat, p); CDir *dir = get_dirfrag(fg); if (is_auth()) { assert(dir); // i am auth; i had better have this dir open - if (!(dir->fnode.fragstat == fragstat)) { - dout(10) << " got changed fragstat " << fragstat << " != old " << dir->fnode.fragstat - << ", setting updated flag" << dendl; + if (!(fragstat == accounted_fragstat)) { + dout(10) << " got changed fragstat " << fragstat << " on " << *dir << dendl; + dout(20) << " accounted_fragstat " << accounted_fragstat << dendl; + dir->fnode.fragstat = fragstat; + dir->fnode.accounted_fragstat = accounted_fragstat; dirlock.set_updated(); + } else { + assert(dir->fnode.fragstat == fragstat); } - dir->fnode.fragstat = fragstat; } else { if (dir && + dir->is_auth() && !(dir->fnode.accounted_fragstat == fragstat)) { dout(10) << " setting accounted_fragstat " << fragstat << " and setting dirty bit on " << *dir << dendl; @@ -675,19 +703,19 @@ void CInode::finish_scatter_gather_update(int type) // adjust summation assert(is_auth()); inode_t *pi = get_projected_inode(); - dout(20) << " orig dirstat " << pi->dirstat << dendl; + dout(20) << " orig dirstat " << pi->dirstat << dendl; pi->dirstat.version++; for (map::iterator p = dirfrags.begin(); p != dirfrags.end(); p++) { fnode_t *pf = p->second->get_projected_fnode(); - dout(20) << " frag " << p->first - << " " << pf->fragstat - << " " << pf->accounted_fragstat << dendl; + dout(20) << " frag " << p->first << " " << *p->second << dendl; + dout(20) << " fragstat " << pf->fragstat << dendl; + dout(20) << " accounted_fragstat " << pf->fragstat << dendl; pi->dirstat.take_diff(pf->fragstat, pf->accounted_fragstat); } - dout(20) << " final dirstat " << pi->dirstat << dendl; + dout(20) << " final dirstat " << pi->dirstat << dendl; } break; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index cd766104e41d..dd71aa9b8358 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -148,6 +148,9 @@ class CInode : public MDSCacheObject { else return projected_inode.back()->version; } + bool is_projected() { + return !projected_inode.empty(); + } inode_t *get_projected_inode() { if (projected_inode.empty()) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c5276b6c03ab..1c0431cabdef 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4017,9 +4017,10 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who } assert(curdir); - // HACK +#ifdef MDS_VERIFY_FRAGSTAT if (curdir->is_complete()) curdir->verify_fragstat(); +#endif // frozen? /* diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 53eaaddf48b8..b3bd3e2b36d4 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -958,8 +958,10 @@ int Migrator::encode_export_dir(bufferlist& exportbl, assert(dir->get_projected_version() == dir->get_version()); +#ifdef MDS_VERIFY_FRAGSTAT if (dir->is_complete()) dir->verify_fragstat(); +#endif // dir dirfrag_t df = dir->dirfrag(); @@ -2169,9 +2171,11 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, le->metablob.add_dentry(dn, dn->is_dirty()); } +#ifdef MDS_VERIFY_FRAGSTAT if (dir->is_complete()) dir->verify_fragstat(); - +#endif + dout(7) << "decode_import_dir done " << *dir << dendl; return num_imported; } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 11758cc6015d..0e9fffb06fe1 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -629,8 +629,11 @@ void Server::set_trace_dist(Session *session, MClientReply *reply, CInode *in, C dout(20) << " trace added " << lmask << " " << *dn << dendl; // dir +#ifdef MDS_VERIFY_FRAGSTAT if (dn->get_dir()->is_complete()) dn->get_dir()->verify_fragstat(); +#endif + DirStat::encode(bl, dn->get_dir(), whoami); dout(20) << " trace added " << *dn->get_dir() << dendl; @@ -1848,7 +1851,9 @@ void Server::handle_client_readdir(MDRequest *mdr) return; } +#ifdef MDS_VERIFY_FRAGSTAT dir->verify_fragstat(); +#endif mdr->now = g_clock.real_now(); diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 9b342b6607f3..ce657482f919 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -20,7 +20,7 @@ using namespace std; #include "include/xlist.h" #define MDS_REF_SET // define me for improved debug output, sanity checking - +//#define MDS_VERIFY_FRAGSTAT // do do (slow) sanity checking on frags #define MDS_PORT_CACHE 0x200 #define MDS_PORT_LOCKER 0x300 diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index 91189f5ec026..52606d32821d 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -139,6 +139,7 @@ struct InodeStat { max_size = e.max_size; rdev = e.rdev; + memset(&dirstat, 0, sizeof(dirstat)); dirstat.nfiles = e.files; dirstat.nsubdirs = e.subdirs; dirstat.rctime.decode_timeval(&e.rctime);