From a296989b4d3a9291743b1211f6f770e9eb465b57 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 May 2008 15:06:47 -0700 Subject: [PATCH] introduce time_warp_seq to EXCL utimes() + cap release vs file_cap/fill_inode races --- src/client/Client.cc | 23 ++++-- src/include/ceph_fs.h | 2 + src/include/types.h | 1 + src/kernel/inode.c | 130 +++++++++++++++++++++------------ src/kernel/mds_client.c | 11 ++- src/kernel/super.c | 1 + src/kernel/super.h | 1 + src/mds/Locker.cc | 8 +- src/mds/Server.cc | 1 + src/messages/MClientFileCaps.h | 2 + src/messages/MClientReply.h | 3 + 11 files changed, 125 insertions(+), 58 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index abdf3d317e632..7f86168b7afa4 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -340,10 +340,13 @@ void Client::update_inode(Inode *in, InodeStat *st, LeaseStat *lease, utime_t fr if ((issued & CEPH_CAP_EXCL) == 0) { if ((uint64_t)st->size > in->inode.size) in->inode.size = st->size; - if (st->mtime > in->inode.mtime) - in->inode.mtime = st->mtime; - if (st->atime > in->inode.atime) - in->inode.atime = st->atime; + if (st->time_warp_seq >= in->inode.time_warp_seq) { + if (st->mtime > in->inode.mtime) + in->inode.mtime = st->mtime; + if (st->atime > in->inode.atime) + in->inode.atime = st->atime; + in->inode.time_warp_seq = st->time_warp_seq; + } } } else { in->inode.size = st->size; @@ -1435,10 +1438,13 @@ void Client::handle_file_caps(MClientFileCaps *m) // update inode if (m->get_size() > in->inode.size) in->inode.size = m->get_size(); - if (m->get_mtime() > in->inode.mtime && (old_caps & CEPH_CAP_EXCL) == 0) - in->inode.mtime = m->get_mtime(); - if (m->get_atime() > in->inode.atime && (old_caps & CEPH_CAP_EXCL) == 0) - in->inode.atime = m->get_atime(); + if (m->get_time_warp_seq() >= in->inode.time_warp_seq) { + if (m->get_mtime() > in->inode.mtime && (old_caps & CEPH_CAP_EXCL) == 0) + in->inode.mtime = m->get_mtime(); + if (m->get_atime() > in->inode.atime && (old_caps & CEPH_CAP_EXCL) == 0) + in->inode.atime = m->get_atime(); + in->inode.time_warp_seq = m->get_time_warp_seq(); + } // share our (possibly newer) file size, mtime, atime m->set_size(in->inode.size); @@ -2279,6 +2285,7 @@ int Client::_utimes(const filepath &path, utime_t mtime, utime_t atime, bool fol if (dn && dn->inode && (dn->inode->file_caps() & want) == want) { dout(5) << " have WR and EXCL caps, just updating our m/atime" << dendl; + dn->inode->inode.time_warp_seq++; dn->inode->inode.mtime = mtime; dn->inode->inode.atime = atime; return 0; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 598337cdafdad..08c4188642792 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -495,6 +495,7 @@ struct ceph_mds_reply_inode { __le64 version; struct ceph_file_layout layout; struct ceph_timespec ctime, mtime, atime; + __le64 time_warp_seq; __le32 mode, uid, gid; __le32 nlink; __le64 size, max_size; @@ -592,6 +593,7 @@ struct ceph_mds_file_caps { __le64 size, max_size; __le32 migrate_mds, migrate_seq; struct ceph_timespec mtime, atime, ctime; + __le64 time_warp_seq; } __attribute__ ((packed)); diff --git a/src/include/types.h b/src/include/types.h index 94b9599e539a1..d187261ed8c10 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -203,6 +203,7 @@ struct inode_t { utime_t mtime; // file data modify time. utime_t atime; // file data access time. utime_t rmtime; // recursive mtime? + uint64_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes()) // special stuff version_t version; // auth only diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 013c8a2e47fec..9b67ffe59cf78 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -78,6 +78,7 @@ int ceph_fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info) u32 su = le32_to_cpu(info->layout.fl_stripe_unit); int blkbits = fls(su) - 1; u64 blocks = (size + (1<<9) - 1) >> 9; + u64 time_warp_seq; dout(30, "fill_inode %p ino %llx by %d.%d sz=%llu mode %o nlink %d\n", inode, info->ino, inode->i_uid, inode->i_gid, @@ -102,29 +103,49 @@ int ceph_fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info) ceph_decode_timespec(&atime, &info->atime); ceph_decode_timespec(&mtime, &info->mtime); ceph_decode_timespec(&ctime, &info->ctime); + time_warp_seq = le64_to_cpu(info->time_warp_seq); + issued = __ceph_caps_issued(ci); if (issued & CEPH_CAP_EXCL) { if (timespec_compare(&ctime, &inode->i_ctime) > 0) inode->i_ctime = ctime; + if (time_warp_seq > ci->i_time_warp_seq) + derr(0, "WARNING: %p mds time_warp_seq %llu > %llu\n", + inode, time_warp_seq, ci->i_time_warp_seq); } else if (issued & (CEPH_CAP_WR|CEPH_CAP_WRBUFFER)) { if (size > inode->i_size) { inode->i_size = size; inode->i_blkbits = blkbits; inode->i_blocks = blocks; } - if (timespec_compare(&mtime, &inode->i_mtime) > 0) - inode->i_mtime = mtime; - if (timespec_compare(&atime, &inode->i_atime) > 0) - inode->i_atime = atime; - if (timespec_compare(&ctime, &inode->i_ctime) > 0) - inode->i_ctime = ctime; + if (time_warp_seq >= ci->i_time_warp_seq) { + if (timespec_compare(&mtime, &inode->i_mtime) > 0) + inode->i_mtime = mtime; + if (timespec_compare(&atime, &inode->i_atime) > 0) + inode->i_atime = atime; + if (timespec_compare(&ctime, &inode->i_ctime) > 0) + inode->i_ctime = ctime; + ci->i_time_warp_seq = time_warp_seq; + } else + dout(10, " mds time_warp_seq %llu < %llu\n", + time_warp_seq, ci->i_time_warp_seq); } else { + if (timespec_compare(&mtime, &inode->i_mtime) != 0) + dout(10, "%p issued %d .. full update, " + "mtime %lu.%lu -> %lu.%lu\n", inode, issued, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime.tv_sec, mtime.tv_nsec); inode->i_size = size; inode->i_blkbits = blkbits; inode->i_blocks = blocks; - inode->i_mtime = mtime; - inode->i_atime = atime; - inode->i_ctime = ctime; + if (time_warp_seq >= ci->i_time_warp_seq) { + inode->i_mtime = mtime; + inode->i_atime = atime; + inode->i_ctime = ctime; + ci->i_time_warp_seq = time_warp_seq; + } else + dout(10, " mds time_warp_seq %llu < %llu\n", + time_warp_seq, ci->i_time_warp_seq); } /* ceph inode */ @@ -410,7 +431,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ceph_init_dentry(dn); /* just in case */ req->r_last_dentry = NULL; if (req->r_old_dentry) { - dout(10, " src %p '%.*s' dst %p '%.*s'\n", + dout(10, " src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, @@ -418,7 +439,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dout(10, "fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); d_move(req->r_old_dentry, dn); - dout(10, " src %p '%.*s' dst %p '%.*s'\n", + dout(10, " src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, @@ -917,11 +938,12 @@ int ceph_handle_cap_grant(struct inode *inode, struct ceph_mds_file_caps *grant, struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); - int newcaps; + int newcaps = le32_to_cpu(grant->caps); int used; int issued; /* to me, before */ int wanted; int reply = 0; + u64 time_warp_seq; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; @@ -968,30 +990,45 @@ int ceph_handle_cap_grant(struct inode *inode, struct ceph_mds_file_caps *grant, wake = 1; } + /* ctime? */ + ceph_decode_timespec(&ctime, &grant->ctime); + if (timespec_compare(&ctime, &inode->i_ctime) > 0) { + dout(10, "ctime %lu.%09ld -> %lu.%.09ld\n", + ctime.tv_sec, ctime.tv_nsec, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec); + inode->i_ctime = ctime; + } + /* mtime/atime? */ issued = __ceph_caps_issued(ci); - if ((issued & CEPH_CAP_EXCL) == 0) { + time_warp_seq = le64_to_cpu(grant->time_warp_seq); + if (issued & CEPH_CAP_EXCL) { + if (time_warp_seq > ci->i_time_warp_seq) + derr(0, "WARNING: %p mds time_warp_seq %llu > %llu\n", + inode, time_warp_seq, ci->i_time_warp_seq); + } else { ceph_decode_timespec(&mtime, &grant->mtime); ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - if (timespec_compare(&mtime, &inode->i_mtime) > 0) { - dout(10, "mtime %lu.%09ld -> %lu.%.09ld\n", - mtime.tv_sec, mtime.tv_nsec, - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec); - inode->i_mtime = mtime; - } - if (timespec_compare(&ctime, &inode->i_ctime) > 0) { - dout(10, "ctime %lu.%09ld -> %lu.%.09ld\n", - ctime.tv_sec, ctime.tv_nsec, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec); - inode->i_ctime = ctime; - } - if (timespec_compare(&atime, &inode->i_atime) > 0) { - dout(10, "atime %lu.%09ld -> %lu.%09ld\n", - atime.tv_sec, atime.tv_nsec, - inode->i_atime.tv_sec, inode->i_atime.tv_nsec); - inode->i_atime = atime; - } + if (time_warp_seq >= ci->i_time_warp_seq) { + if (timespec_compare(&mtime, &inode->i_mtime) > 0) { + dout(10, "%p mtime %lu.%09ld -> %lu.%.09ld\n", + inode, mtime.tv_sec, mtime.tv_nsec, + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec); + inode->i_mtime = mtime; + } + if (timespec_compare(&atime, &inode->i_atime) > 0) { + dout(10, "%p atime %lu.%09ld -> %lu.%09ld\n", + inode, + atime.tv_sec, atime.tv_nsec, + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec); + inode->i_atime = atime; + } + ci->i_time_warp_seq = time_warp_seq; + } else + dout(10, " mds time_warp_seq %llu < %llu\n", + time_warp_seq, ci->i_time_warp_seq); } /* check cap bits */ @@ -1007,7 +1044,6 @@ int ceph_handle_cap_grant(struct inode *inode, struct ceph_mds_file_caps *grant, cap->seq = seq; /* revocation? */ - newcaps = le32_to_cpu(grant->caps); if (cap->issued & ~newcaps) { dout(10, "revocation: %d -> %d\n", cap->issued, newcaps); if ((cap->issued & ~newcaps) & CEPH_CAP_RDCACHE) @@ -1021,6 +1057,7 @@ int ceph_handle_cap_grant(struct inode *inode, struct ceph_mds_file_caps *grant, grant->max_size = 0; /* don't re-request */ ceph_encode_timespec(&grant->mtime, &inode->i_mtime); ceph_encode_timespec(&grant->atime, &inode->i_atime); + grant->time_warp_seq = cpu_to_le64(ci->i_time_warp_seq); reply = 1; } cap->issued = newcaps; @@ -1070,7 +1107,7 @@ static int apply_truncate(struct inode *inode, loff_t size) { struct ceph_inode_info *ci = ceph_inode(inode); int rc; - + rc = vmtruncate(inode, size); if (rc == 0) { spin_lock(&inode->i_lock); @@ -1090,7 +1127,7 @@ void ceph_vmtruncate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_vmtruncate_work); struct inode *inode = &ci->vfs_inode; - + dout(10, "vmtruncate_work %p\n", inode); mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); @@ -1142,7 +1179,7 @@ int ceph_handle_cap_trunc(struct inode *inode, struct ceph_mds_file_caps *trunc, if (ci->i_vmtruncate_to < 0 && size > inode->i_size) dout(10, "clean fwd truncate, no vmtruncate needed\n"); else if (ci->i_vmtruncate_to >= 0 && size >= ci->i_vmtruncate_to) - dout(10, "trunc to %lld < %lld already queued\n", + dout(10, "trunc to %lld < %lld already queued\n", ci->i_vmtruncate_to, size); else { /* we need to trunc even smaller */ @@ -1376,6 +1413,7 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr) /* if i hold CAP_EXCL, i can change [am]time any way i like */ if (ceph_caps_issued(ci) & CEPH_CAP_EXCL) { dout(10, "utime holding EXCL, doing locally\n"); + ci->i_time_warp_seq++; if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -1488,29 +1526,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) /* gratuitous debug output */ if (ia_valid & ATTR_UID) - dout(10, "setattr: uid %d -> %d\n", inode->i_uid, attr->ia_uid); + dout(10, "setattr: %p uid %d -> %d\n", inode, + inode->i_uid, attr->ia_uid); if (ia_valid & ATTR_GID) - dout(10, "setattr: gid %d -> %d\n", inode->i_uid, attr->ia_uid); + dout(10, "setattr: %p gid %d -> %d\n", inode, + inode->i_uid, attr->ia_uid); if (ia_valid & ATTR_MODE) - dout(10, "setattr: mode %o -> %o\n", inode->i_mode, + dout(10, "setattr: %p mode %o -> %o\n", inode, inode->i_mode, attr->ia_mode); if (ia_valid & ATTR_SIZE) - dout(10, "setattr: size %lld -> %lld\n", inode->i_size, - attr->ia_size); + dout(10, "setattr: %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); if (ia_valid & ATTR_ATIME) - dout(10, "setattr: atime %ld.%ld -> %ld.%ld\n", + dout(10, "setattr: %p atime %ld.%ld -> %ld.%ld\n", inode, inode->i_atime.tv_sec, inode->i_atime.tv_nsec, attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); if (ia_valid & ATTR_MTIME) - dout(10, "setattr: mtime %ld.%ld -> %ld.%ld\n", + dout(10, "setattr: %p mtime %ld.%ld -> %ld.%ld\n", inode, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); if (ia_valid & ATTR_MTIME) - dout(10, "setattr: ctime %ld.%ld -> %ld.%ld\n", + dout(10, "setattr: %p ctime %ld.%ld -> %ld.%ld\n", inode, inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec); if (ia_valid & ATTR_FILE) - dout(10, "setattr: ATTR_FILE ... hrm!\n"); + dout(10, "setattr: %p ATTR_FILE ... hrm!\n", inode); /* chown */ if (ia_valid & (ATTR_UID|ATTR_GID)) { diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c index 5310a0866fd16..5eae2f16fbab8 100644 --- a/src/kernel/mds_client.c +++ b/src/kernel/mds_client.c @@ -1354,7 +1354,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, static void send_cap_ack(struct ceph_mds_client *mdsc, __u64 ino, int caps, int wanted, __u32 seq, __u64 size, __u64 max_size, struct timespec *mtime, struct timespec *atime, - int mds) + u64 time_warp_seq, int mds) { struct ceph_mds_file_caps *fc; struct ceph_msg *msg; @@ -1381,6 +1381,7 @@ static void send_cap_ack(struct ceph_mds_client *mdsc, __u64 ino, int caps, ceph_encode_timespec(&fc->mtime, mtime); if (atime) ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le64(time_warp_seq); send_msg_mds(mdsc, msg, mds); } @@ -1435,7 +1436,7 @@ void ceph_mdsc_handle_filecaps(struct ceph_mds_client *mdsc, if (!inode) { dout(10, "wtf, i don't have ino %lu=%llx? closing out cap\n", inot, ino); - send_cap_ack(mdsc, ino, 0, 0, seq, size, 0, 0, 0, mds); + send_cap_ack(mdsc, ino, 0, 0, seq, size, 0, 0, 0, 0, mds); goto no_inode; } @@ -1497,7 +1498,7 @@ int __ceph_mdsc_send_cap(struct ceph_mds_client *mdsc, int revoking = cap->implemented & ~cap->issued; int dropping = cap->issued & ~wanted; int keep; - __u64 seq; + __u64 seq, time_warp_seq; __u64 size, max_size; struct timespec mtime, atime; int removed_last = 0; @@ -1517,6 +1518,7 @@ int __ceph_mdsc_send_cap(struct ceph_mds_client *mdsc, ci->i_requested_max_size = max_size; mtime = inode->i_mtime; atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; if (wanted == 0) { __ceph_remove_cap(cap); removed_last = list_empty(&ci->i_caps); @@ -1536,7 +1538,8 @@ int __ceph_mdsc_send_cap(struct ceph_mds_client *mdsc, send_cap_ack(mdsc, ceph_ino(inode), keep, wanted, seq, - size, max_size, &mtime, &atime, session->s_mds); + size, max_size, &mtime, &atime, time_warp_seq, + session->s_mds); if (wanted == 0) iput(inode); /* removed cap */ diff --git a/src/kernel/super.c b/src/kernel/super.c index d56a99494ca3f..94217abf99498 100644 --- a/src/kernel/super.c +++ b/src/kernel/super.c @@ -145,6 +145,7 @@ static struct inode *ceph_alloc_inode(struct super_block *sb) dout(10, "alloc_inode %p vfsi %p\n", ci, &ci->vfs_inode); ci->i_version = 0; + ci->i_time_warp_seq = 0; ci->i_symlink = 0; ci->i_lease_session = 0; diff --git a/src/kernel/super.h b/src/kernel/super.h index 2f599f30c1dc6..b169898819eb6 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -165,6 +165,7 @@ struct ceph_inode_info { u64 i_ceph_ino; u64 i_version; + u64 i_time_warp_seq; struct ceph_file_layout i_layout; char *i_symlink; diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index ae44e4c731056..74d8e40fa8a0d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -603,7 +603,8 @@ bool Locker::issue_caps(CInode *in) if (seq > 0 && !cap->is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << cap->get_last_seq() + dout(7) << " sending MClientFileCaps to client" << it->first + << " seq " << cap->get_last_seq() << " new pending " << cap_string(cap->pending()) << " was " << cap_string(before) << dendl; mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT, @@ -1019,6 +1020,11 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) << " for " << *in << dendl; pi->atime = atime; } + if (excl && pi->time_warp_seq < m->get_time_warp_seq()) { + dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq() + << " for " << *in << dendl; + pi->time_warp_seq = m->get_time_warp_seq(); + } le->metablob.add_dir_context(in->get_parent_dir()); le->metablob.add_primary_dentry(in->parent, true, 0, pi); LogSegment *ls = mds->mdlog->get_current_segment(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index cca420fd5c275..48c1c9a5340ca 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1613,6 +1613,7 @@ void Server::handle_client_utime(MDRequest *mdr) pi->version = cur->pre_dirty(); pi->ctime = g_clock.real_now(); + pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point. // log + wait mdr->ls = mdlog->get_current_segment(); diff --git a/src/messages/MClientFileCaps.h b/src/messages/MClientFileCaps.h index 8fd6654993e0d..e70cca4238936 100644 --- a/src/messages/MClientFileCaps.h +++ b/src/messages/MClientFileCaps.h @@ -46,6 +46,7 @@ class MClientFileCaps : public Message { utime_t get_ctime() { return utime_t(h.ctime); } utime_t get_mtime() { return utime_t(h.mtime); } utime_t get_atime() { return utime_t(h.atime); } + __u64 get_time_warp_seq() { return h.time_warp_seq; } // for cap migration int get_migrate_mds() { return h.migrate_mds; } @@ -84,6 +85,7 @@ class MClientFileCaps : public Message { inode.mtime.encode_timeval(&h.mtime); inode.atime.encode_timeval(&h.atime); inode.ctime.encode_timeval(&h.ctime); + h.time_warp_seq = inode.time_warp_seq; } const char *get_type_name() { return "Cfcap";} diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index 7c57d97ce25e2..1306bffdf4267 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -105,6 +105,7 @@ struct InodeStat { utime_t ctime, mtime, atime; unsigned mode, uid, gid, nlink, rdev; loff_t size, max_size; + version_t time_warp_seq; string symlink; // symlink content (if symlink) fragtree_t dirfragtree; @@ -124,6 +125,7 @@ struct InodeStat { ctime.decode_timeval(&e.ctime); mtime.decode_timeval(&e.mtime); atime.decode_timeval(&e.atime); + time_warp_seq = e.time_warp_seq; mode = e.mode; uid = e.uid; gid = e.gid; @@ -155,6 +157,7 @@ struct InodeStat { in->inode.ctime.encode_timeval(&e.ctime); in->inode.mtime.encode_timeval(&e.mtime); in->inode.atime.encode_timeval(&e.atime); + e.time_warp_seq = in->inode.time_warp_seq; e.mode = in->inode.mode; e.uid = in->inode.uid; e.gid = in->inode.gid; -- 2.39.5