- dentry versions vs dirfrags...
- failure during reconnect vs clientmap.
+- make truncate faster with a trunc_seq, attached to objects as attributes?
+
- inode.rctime (recursive mtime)?
- make inode.size reflect directory size (number of entries)?
__le32 mode, uid, gid;
__le32 nlink;
__le64 size, max_size;
- __le64 nested_size;
- struct ceph_timespec nested_ctime;
+ __le64 rbytes, rfiles;
+ struct ceph_timespec rctime;
__le32 rdev;
struct ceph_frag_tree_head fragtree;
} __attribute__ ((packed));
};
struct nested_info_t {
- uint64_t nested_size; // \sum_{children}(size + nested_size)
- utime_t nested_ctime; // \max_{children}(ctime, nested_ctime)
+ utime_t rctime; // \max_{children}(ctime, rctime)
+ __u64 rbytes; // recursive byte total (takes over from nested_size) -- presumably \sum_{children}(size + rbytes)
+ __u64 rfiles; // recursive file count -- presumably; verify against the code that updates it
void encode(bufferlist &bl) const {
- ::encode(nested_size, bl);
- ::encode(nested_ctime, bl);
+ ::encode(rbytes, bl); // field order: rbytes, rfiles, rctime -- decode() below reads them back in this order
+ ::encode(rfiles, bl);
+ ::encode(rctime, bl);
}
void decode(bufferlist::iterator &bl) {
- ::decode(nested_size, bl);
- ::decode(nested_ctime, bl);
+ ::decode(rbytes, bl); // same field order as encode() above
+ ::decode(rfiles, bl);
+ ::decode(rctime, bl);
}
};
WRITE_CLASS_ENCODER(nested_info_t)
uint64_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes())
// dirfrag, recursive accounting
- nested_info_t nested; // inline summation
+ nested_info_t accounted_nested; // what dirfrag has seen
+ nested_info_t nested; // inline summation for child dirfrags.
+ /*
+ * if accounted_nested does not match nested, the parent dirfrag needs to be
+ * adjusted by the difference.
+ */
// special stuff
version_t version; // auth only
// projected values (only defined while dirty)
list<inode_t*> projected_inode;
- list<fragtree_t> projected_dirfragtree;
version_t get_projected_version() {
if (projected_inode.empty())
}
-
+// nested ---------------------------------------------------------------
void scatter_tempsync(ScatterLock *lock);
bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr);
void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr);
- bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr);
+public:
+ bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); // public for Server's predirty_nested
+protected:
void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr);
void scatter_writebehind(ScatterLock *lock);
bool committing;
bool aborted;
+ // for applying projected inode changes
+ list<CInode*> projected_inodes;
+
// break rarely-used fields into a separately allocated structure
// to save memory for most ops
struct More {
}
auth_pins.clear();
}
+
+ void add_projected_inode(CInode *in) { // append in to projected_inodes; entries keep call order
+ projected_inodes.push_back(in);
+ }
+ void pop_and_dirty_projected_inodes() { // drain projected_inodes front to back, leaving the list empty
+ while (!projected_inodes.empty()) {
+ CInode *in = projected_inodes.front();
+ projected_inodes.pop_front(); // entry is removed from the list before the call below
+ in->pop_and_dirty_projected_inode(ls); // ls is not declared in this block -- presumably a member of the enclosing struct
+ }
+ }
};
inline ostream& operator<<(ostream& out, MDRequest &mdr)
}
}
+// Push the not-yet-accounted recursive statistics of 'in' (rbytes, rfiles,
+// rctime) up through its ancestor directory inodes, projecting each ancestor
+// and journaling it into 'blob'.
+//
+//   mdr    - request that takes the scatter wrlocks and remembers every
+//            projected ancestor (via add_projected_inode)
+//   blob   - metablob that receives the dir context of the starting dirfrag
+//            and one primary dentry per projected ancestor
+//   in     - inode whose nested-vs-accounted difference is propagated
+//   parent - dirfrag to start from; when null, in->get_parent_dir() is used
+//
+// Propagation is opportunistic: the walk stops at the first ancestor whose
+// dirlock cannot be wrlocked, or that is not auth.  Whatever was not pushed
+// up stays visible as a difference between nested and accounted_nested.
+void Server::predirty_nested(MDRequest *mdr, EMetaBlob *blob, CInode *in, CDir *parent)
+{
+  if (!parent)
+    parent = in->get_parent_dir();
+
+  // initial diff from *in
+  // The deltas are kept in __u64; a decrease is carried as a wrapped value
+  // and the += on the ancestors wraps back to the intended result.
+  inode_t *curi = in->get_projected_inode();
+  __u64 drbytes;
+  __u64 drfiles;
+  utime_t rctime;
+  if (in->is_dir()) {
+    drbytes = curi->nested.rbytes - curi->accounted_nested.rbytes;
+    drfiles = curi->nested.rfiles - curi->accounted_nested.rfiles;
+    rctime = MAX(curi->ctime, curi->nested.rctime);
+  } else {
+    // a non-directory contributes its own size and counts as one file
+    drbytes = curi->size - curi->accounted_nested.rbytes;
+    drfiles = 1 - curi->accounted_nested.rfiles;
+    rctime = curi->ctime;
+  }
+
+  // Use the resolved parent, not in->get_parent_dir(): callers may pass an
+  // explicit dirfrag for an inode that was only just created (the symlink
+  // path passes dn->dir together with the new inode).
+  blob->add_dir_context(parent);
+
+  // build list of inodes to wrlock, dirty, and update
+  list<CInode*> ls;
+  CInode *cur = in;
+  while (parent) {
+    assert(cur->is_auth());
+    assert(parent->is_auth());
+
+    // opportunistically adjust parent dirfrag
+    CInode *pin = parent->get_inode();
+    if (!pin->dirlock.can_wrlock()) {
+      dout(10) << " can't wrlock " << pin->dirlock << " on " << *pin << dendl;
+      break;
+    }
+    bool r = mds->locker->scatter_wrlock_start(&pin->dirlock, mdr);
+    assert(r);
+
+    if (!pin->is_auth()) {
+      break;
+    }
+
+    // project update
+    version_t ppv = pin->pre_dirty();
+    inode_t *pi = pin->project_inode();
+    pi->version = ppv;
+    pi->nested.rbytes += drbytes;
+    pi->nested.rfiles += drfiles;
+    // rctime is a maximum over the children: never let a late (deferred)
+    // propagation with an older timestamp move an ancestor backwards.
+    pi->nested.rctime = MAX(pi->nested.rctime, rctime);
+    mdr->add_projected_inode(pin);
+    ls.push_back(pin);
+
+    frag_t fg = parent->dirfrag().frag;
+    pin->dirfrag_nested[fg].rbytes += drbytes;
+    pin->dirfrag_nested[fg].rfiles += drfiles;
+    pin->dirfrag_nested[fg].rctime = MAX(pin->dirfrag_nested[fg].rctime, rctime);
+
+    // the child's contribution has now been handed to its parent
+    curi->accounted_nested.rbytes += drbytes;
+    curi->accounted_nested.rfiles += drfiles;
+    curi->accounted_nested.rctime = rctime;
+
+    // step one level up; the same deltas apply to every ancestor
+    cur = pin;
+    curi = pi;
+    parent = cur->get_parent_dir();
+  }
+
+  // now, stick it in the blob
+  for (list<CInode*>::iterator p = ls.begin();
+       p != ls.end();
+       ++p) {
+    CInode *anc = *p;
+    inode_t *pi = anc->get_projected_inode();
+    blob->add_primary_dentry(anc->get_parent_dn(), true, 0, pi);
+  }
+}
newi->symlink = req->get_path2();
newi->inode.size = newi->symlink.length();
newi->inode.version = dn->pre_dirty() - 1;
-
+
// prepare finisher
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "symlink");
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
le->metablob.add_dir_context(dn->dir);
+ predirty_nested(mdr, &le->metablob, newi, dn->dir);
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
-
+
// log + wait
mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
}
version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob);
void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv);
+ void predirty_nested(MDRequest *mdr, EMetaBlob *blob, CInode *in, CDir *parent);
// requests on existing inodes.
inodeno_t ino;
version_t version;
ceph_file_layout layout;
- utime_t ctime, mtime, atime, nested_ctime;
+ utime_t ctime, mtime, atime, rctime;
unsigned mode, uid, gid, nlink, rdev;
loff_t size, max_size, nested_size;
version_t time_warp_seq;
ctime.decode_timeval(&e.ctime);
mtime.decode_timeval(&e.mtime);
atime.decode_timeval(&e.atime);
- nested.nested_ctime.decode_timeval(&e.nested_ctime);
time_warp_seq = e.time_warp_seq;
mode = e.mode;
uid = e.uid;
size = e.size;
max_size = e.max_size;
rdev = e.rdev;
- nested.nested_size = e.nested_size;
- nested_ctime.decode_timeval(&e.nested_ctime);
- nested_size = e.nested_size;
+ nested.rctime.decode_timeval(&e.rctime);
+ nested.rbytes = e.rbytes;
+ nested.rfiles = e.rfiles;
int n = e.fragtree.nsplits;
while (n) {
in->inode.ctime.encode_timeval(&e.ctime);
in->inode.mtime.encode_timeval(&e.mtime);
in->inode.atime.encode_timeval(&e.atime);
- in->inode.nested.nested_ctime.encode_timeval(&e.nested_ctime);
e.time_warp_seq = in->inode.time_warp_seq;
e.mode = in->inode.mode;
e.uid = in->inode.uid;
e.nlink = in->inode.nlink;
e.size = in->inode.size;
e.max_size = in->inode.max_size;
- e.nested_size = in->inode.nested.nested_size;
+
+ in->inode.nested.rctime.encode_timeval(&e.rctime);
+ e.rbytes = in->inode.nested.rbytes;
+ e.rfiles = in->inode.nested.rfiles;
+
e.rdev = in->inode.rdev;
e.fragtree.nsplits = in->dirfragtree._splits.size();
::encode(e, bl);