From: Sage Weil Date: Thu, 22 May 2008 21:20:30 +0000 (-0700) Subject: mds: some prelim nesting updates X-Git-Tag: v0.3~170^2~89 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=89ca370cc097bb9be0f5000f641ce609ee9e2e38;p=ceph.git mds: some prelim nesting updates --- diff --git a/src/TODO b/src/TODO index f56542fdbe2..849057d4cc9 100644 --- a/src/TODO +++ b/src/TODO @@ -88,6 +88,8 @@ mds - dentry versions vs dirfrags... - failure during reconnect vs clientmap. +- make truncate faster with a trunc_seq, attached to objects as attributes? + - inode.rctime (recursive mtime)? - make inode.size reflect directory size (number of entries)? diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 7c73f462f7a..00a000a2325 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -626,8 +626,8 @@ struct ceph_mds_reply_inode { __le32 mode, uid, gid; __le32 nlink; __le64 size, max_size; - __le64 nested_size; - struct ceph_timespec nested_ctime; + __le64 rbytes, rfiles; + struct ceph_timespec rctime; __le32 rdev; struct ceph_frag_tree_head fragtree; } __attribute__ ((packed)); diff --git a/src/include/types.h b/src/include/types.h index a9272ba3c0c..86a81beef6e 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -200,16 +200,19 @@ struct FileLayout { }; struct nested_info_t { - uint64_t nested_size; // \sum_{children}(size + nested_size) - utime_t nested_ctime; // \max_{children}(ctime, nested_ctime) + utime_t rctime; // \max_{children}(ctime, nested_ctime) + __u64 rbytes; + __u64 rfiles; void encode(bufferlist &bl) const { - ::encode(nested_size, bl); - ::encode(nested_ctime, bl); + ::encode(rbytes, bl); + ::encode(rfiles, bl); + ::encode(rctime, bl); } void decode(bufferlist::iterator &bl) { - ::decode(nested_size, bl); - ::decode(nested_ctime, bl); + ::decode(rbytes, bl); + ::decode(rfiles, bl); + ::decode(rctime, bl); } }; WRITE_CLASS_ENCODER(nested_info_t) @@ -240,7 +243,12 @@ struct inode_t { uint64_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes()) // dirfrag, recursive accounting - nested_info_t nested; // inline summation + nested_info_t accounted_nested; // what dirfrag has seen + nested_info_t nested; // inline summation for child dirfrags. + /* + * if accounted_nested does not match nested, the parent dirfrag needs to be + * adjusted by the difference. + */ // special stuff version_t version; // auth only diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 9eede635965..8ac19a921c9 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -144,7 +144,6 @@ class CInode : public MDSCacheObject { // projected values (only defined while dirty) list projected_inode; - list projected_dirfragtree; version_t get_projected_version() { if (projected_inode.empty()) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 2a620f5c96a..ae0d83edb0c 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1207,7 +1207,7 @@ void Locker::revoke_client_leases(SimpleLock *lock) } - +// nested --------------------------------------------------------------- diff --git a/src/mds/Locker.h b/src/mds/Locker.h index f645bbeb16f..4195f4a25a4 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -135,7 +135,9 @@ protected: void scatter_tempsync(ScatterLock *lock); bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); - bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); +public: + bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); // public for Server's predirty_nested +protected: void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); void scatter_writebehind(ScatterLock *lock); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 7e5cc7f1e0e..95d5c472258 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -118,6 +118,9 @@ struct MDRequest { bool committing; bool aborted; + // for applying projected inode changes + list projected_inodes; + // break rarely-used fields into a separately allocated structure // to save memory for most ops struct More { @@ -222,6 +225,17 @@ struct MDRequest { } auth_pins.clear(); } + + void add_projected_inode(CInode *in) { + projected_inodes.push_back(in); + } + void pop_and_dirty_projected_inodes() { + while (!projected_inodes.empty()) { + CInode *in = projected_inodes.front(); + projected_inodes.pop_front(); + in->pop_and_dirty_projected_inode(ls); + } + } }; inline ostream& operator<<(ostream& out, MDRequest &mdr) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index be81268ad1d..febbe399c45 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1539,6 +1539,81 @@ void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) } } +void Server::predirty_nested(MDRequest *mdr, EMetaBlob *blob, CInode *in, CDir *parent) +{ + if (!parent) + parent = in->get_parent_dir(); + + // initial diff from *in + inode_t *curi = in->get_projected_inode(); + __u64 drbytes; + __u64 drfiles; + utime_t rctime; + if (in->is_dir()) { + drbytes = curi->nested.rbytes - curi->accounted_nested.rbytes; + drfiles = curi->nested.rfiles - curi->accounted_nested.rfiles; + rctime = MAX(curi->ctime, curi->nested.rctime); + } else { + drbytes = curi->size - curi->accounted_nested.rbytes; + drfiles = 1 - curi->accounted_nested.rfiles; + rctime = curi->ctime; + } + + blob->add_dir_context(in->get_parent_dir()); + + // build list of inodes to wrlock, dirty, and update + list ls; + CInode *cur = in; + while (parent) { + assert(cur->is_auth()); + assert(parent->is_auth()); + + // opportunistically adjust parent dirfrag + CInode *pin = parent->get_inode(); + if (!pin->dirlock.can_wrlock()) { + dout(10) << " can't wrlock " << pin->dirlock << " on " << *pin << dendl; + break; + } + bool r = mds->locker->scatter_wrlock_start(&pin->dirlock, mdr); + assert(r); + + if (!pin->is_auth()) { + break; + } + + // project update + version_t ppv = pin->pre_dirty(); + inode_t *pi = pin->project_inode(); + pi->version = ppv; + pi->nested.rbytes += drbytes; + pi->nested.rfiles += drfiles; + pi->nested.rctime = rctime; + mdr->add_projected_inode(pin); + ls.push_back(pin); + + frag_t fg = parent->dirfrag().frag; + pin->dirfrag_nested[fg].rbytes += drbytes; + pin->dirfrag_nested[fg].rfiles += drfiles; + pin->dirfrag_nested[fg].rctime = rctime; + + curi->accounted_nested.rbytes += drbytes; + curi->accounted_nested.rfiles += drfiles; + curi->accounted_nested.rctime = rctime; + + cur = pin; + curi = pi; + parent = cur->get_parent_dir(); + } + + // now, stick it in the blob + for (list::iterator p = ls.begin(); + p != ls.end(); + p++) { + CInode *cur = *p; + inode_t *pi = cur->get_projected_inode(); + blob->add_primary_dentry(cur->get_parent_dn(), true, 0, pi); + } +} @@ -2142,7 +2217,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->symlink = req->get_path2(); newi->inode.size = newi->symlink.length(); newi->inode.version = dn->pre_dirty() - 1; - + // prepare finisher mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "symlink"); @@ -2150,8 +2225,9 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); + predirty_nested(mdr, &le->metablob, newi, dn->dir); le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - + // log + wait mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } diff --git a/src/mds/Server.h b/src/mds/Server.h index f7974975ae8..c28a7726f2e 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -102,6 +102,7 @@ public: version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv); + void predirty_nested(MDRequest *mdr, EMetaBlob *blob, CInode *in, CDir *parent); // requests on existing inodes. diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index 9c6ce8b23b5..415e109bd1e 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -104,7 +104,7 @@ struct InodeStat { inodeno_t ino; version_t version; ceph_file_layout layout; - utime_t ctime, mtime, atime, nested_ctime; + utime_t ctime, mtime, atime, rctime; unsigned mode, uid, gid, nlink, rdev; loff_t size, max_size, nested_size; version_t time_warp_seq; @@ -129,7 +129,6 @@ struct InodeStat { ctime.decode_timeval(&e.ctime); mtime.decode_timeval(&e.mtime); atime.decode_timeval(&e.atime); - nested.nested_ctime.decode_timeval(&e.nested_ctime); time_warp_seq = e.time_warp_seq; mode = e.mode; uid = e.uid; @@ -138,10 +137,10 @@ struct InodeStat { size = e.size; max_size = e.max_size; rdev = e.rdev; - nested.nested_size = e.nested_size; - nested_ctime.decode_timeval(&e.nested_ctime); - nested_size = e.nested_size; + nested.rctime.decode_timeval(&e.rctime); + nested.rbytes = e.rbytes; + nested.rfiles = e.rfiles; int n = e.fragtree.nsplits; while (n) { @@ -170,7 +169,6 @@ struct InodeStat { in->inode.ctime.encode_timeval(&e.ctime); in->inode.mtime.encode_timeval(&e.mtime); in->inode.atime.encode_timeval(&e.atime); - in->inode.nested.nested_ctime.encode_timeval(&e.nested_ctime); e.time_warp_seq = in->inode.time_warp_seq; e.mode = in->inode.mode; e.uid = in->inode.uid; @@ -178,7 +176,11 @@ struct InodeStat { e.nlink = in->inode.nlink; e.size = in->inode.size; e.max_size = in->inode.max_size; - e.nested_size = in->inode.nested.nested_size; + + in->inode.nested.rctime.encode_timeval(&e.rctime); + e.rbytes = in->inode.nested.rbytes; + e.rfiles = in->inode.nested.rfiles; + e.rdev = in->inode.rdev; e.fragtree.nsplits = in->dirfragtree._splits.size(); ::encode(e, bl);