From 2ba5f32dbea7ef23c2e8c79b1710606a933c9095 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 30 Jan 2008 16:46:30 -0800 Subject: [PATCH] improved max_size handling in mds, client. avoid spurious setattr/utime after open in client by ignoring sub-second mtime resolution... weird --- src/TODO | 3 +- src/include/ceph_fs.h | 4 +- src/kernel/inode.c | 28 ++++++++++++-- src/kernel/mds_client.c | 13 ++++--- src/kernel/super.h | 3 +- src/mds/CInode.h | 2 + src/mds/Locker.cc | 71 ++++++++++++++++++++++------------ src/mds/LogEvent.cc | 6 --- src/mds/LogEvent.h | 2 - src/mds/Server.cc | 1 + src/mds/events/EFileAccess.h | 51 ------------------------ src/mds/events/EFileWrite.h | 60 ---------------------------- src/mds/journal.cc | 45 --------------------- src/messages/MClientFileCaps.h | 5 ++- src/messages/MClientReply.h | 3 +- 15 files changed, 94 insertions(+), 203 deletions(-) delete mode 100644 src/mds/events/EFileAccess.h delete mode 100644 src/mds/events/EFileWrite.h diff --git a/src/TODO b/src/TODO index 8bbc4dd7d4d7b..1cb3eda3c2559 100644 --- a/src/TODO +++ b/src/TODO @@ -27,10 +27,11 @@ kernel client - audit use of kmalloc vs spinlocks - convert most everything in ceph_fs.h to le32/le64 notation, cleaning up userspace appropriately - vfs - - generate paths relative to the appropriate vfsmount root + - generate paths relative to the appropriate vfsmount root? - getattr should do an lstat? - d_revalidate? - test truncate + - open path needs to call fill_trace/fill_inode such that ci->i_max_size gets filled in - is ino_t really still 32 bits on i386?? hrm! - fix file open vs file_cap race - preemptively release caps as part of request if doing utimes/etc. on an open file? diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 6f4109a295939..fba2ff2568a67 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -412,7 +412,7 @@ struct ceph_mds_reply_inode { struct ceph_timeval ctime, mtime, atime; __u32 mode, uid, gid; __u32 nlink; - __u64 size; + __u64 size, max_size; __u32 rdev; __u32 mask; struct ceph_frag_tree_head fragtree; @@ -449,7 +449,7 @@ struct ceph_mds_file_caps { __le32 seq; __le32 caps, wanted; __le64 ino; - __le64 size; + __le64 size, max_size; __le32 migrate_mds, migrate_seq; struct ceph_timeval mtime, atime; } __attribute__ ((packed)); diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 1f844acb2e513..dc67030939c13 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -89,6 +89,8 @@ int ceph_fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info) ci->i_old_atime = inode->i_atime; + ci->i_max_size = le64_to_cpu(info->max_size); + inode->i_mapping->a_ops = &ceph_aops; switch (inode->i_mode & S_IFMT) { @@ -399,9 +401,28 @@ int ceph_handle_cap_grant(struct inode *inode, struct ceph_mds_file_caps *grant, int used; int wanted = ceph_caps_wanted(ci); int ret = 0; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); dout(10, "handle_cap_grant inode %p ci %p mds%d seq %d\n", inode, ci, mds, seq); dout(10, " my wanted = %d\n", wanted); + dout(10, " size %llu max_size %llu\n", size, max_size); + + /* size change? */ + if (size != inode->i_size) { + /* FIXME: lock something here? */ + dout(10, "size %lld -> %llu\n", inode->i_size, size); + if (size < inode->i_size) { + /* FIXME: truncate page cache? */ + } + inode->i_size = size; + } + + /* max size increase? */ + if (max_size != ci->i_max_size) { + dout(10, "max_size %lld -> %llu\n", ci->i_max_size, max_size); + ci->i_max_size = max_size; + } cap = get_cap_for_mds(inode, mds); @@ -473,7 +494,7 @@ const struct inode_operations ceph_symlink_iops = { /* * generics */ -struct ceph_msg * prepare_setattr(struct ceph_mds_client *mdsc, struct dentry *dentry, int op) +struct ceph_msg *prepare_setattr(struct ceph_mds_client *mdsc, struct dentry *dentry, int op) { char *path; int pathlen; @@ -561,9 +582,10 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) //if (err) return err; } - /* FIXME: does getattr get set to do regular mtime updates? */ /* utimes */ - if (ia_valid & (ATTR_ATIME|ATTR_MTIME)) { + /* FIXME: second resolution here is a hack to avoid setattr on open... :/ */ + if (((ia_valid & ATTR_ATIME) && inode->i_atime.tv_sec != attr->ia_atime.tv_sec) || + ((ia_valid & ATTR_MTIME) && inode->i_mtime.tv_sec != attr->ia_mtime.tv_sec)) { req = prepare_setattr(mdsc, dentry, CEPH_MDS_OP_UTIME); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c index 15844806081c0..baa91801a70d1 100644 --- a/src/kernel/mds_client.c +++ b/src/kernel/mds_client.c @@ -1030,7 +1030,7 @@ void check_new_map(struct ceph_mds_client *mdsc, /* caps */ void send_cap_ack(struct ceph_mds_client *mdsc, __u64 ino, int caps, int wanted, - __u32 seq, __u64 size, int mds) + __u32 seq, __u64 size, __u64 max_size, int mds) { struct ceph_mds_file_caps *fc; struct ceph_msg *msg; @@ -1049,6 +1049,7 @@ void send_cap_ack(struct ceph_mds_client *mdsc, __u64 ino, int caps, int wanted, fc->wanted = cpu_to_le32(wanted); fc->ino = cpu_to_le64(ino); fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); send_msg_mds(mdsc, msg, mds); } @@ -1063,7 +1064,7 @@ void ceph_mdsc_handle_filecaps(struct ceph_mds_client *mdsc, struct ceph_msg *ms int mds = msg->hdr.src.name.num; int op; u32 seq; - u64 ino, size; + u64 ino, size, max_size; ino_t inot; dout(10, "handle_filecaps from mds%d\n", mds); @@ -1075,7 +1076,8 @@ void ceph_mdsc_handle_filecaps(struct ceph_mds_client *mdsc, struct ceph_msg *ms op = le32_to_cpu(h->op); ino = le64_to_cpu(h->ino); seq = le32_to_cpu(h->seq); - size = le64_to_cpu(h->seq); + size = le64_to_cpu(h->size); + max_size = le64_to_cpu(h->max_size); /* find session */ session = __get_session(&client->mdsc, mds); @@ -1098,7 +1100,7 @@ void ceph_mdsc_handle_filecaps(struct ceph_mds_client *mdsc, struct ceph_msg *ms } if (!inode) { dout(10, "hrm, wtf, i don't have ino %lu=%llx? closing out cap\n", inot, ino); - send_cap_ack(mdsc, ino, 0, 0, seq, size, mds); + send_cap_ack(mdsc, ino, 0, 0, seq, size, max_size, mds); return; } @@ -1142,7 +1144,8 @@ int ceph_mdsc_update_cap_wanted(struct ceph_inode_info *ci, int wanted) cap->caps &= wanted; /* drop caps we don't want */ send_cap_ack(mdsc, ceph_ino(&ci->vfs_inode), cap->caps, wanted, - cap->seq, ci->vfs_inode.i_size, cap->mds); + cap->seq, ci->vfs_inode.i_size, ci->i_max_size, + cap->mds); } ci->i_cap_wanted = wanted; diff --git a/src/kernel/super.h b/src/kernel/super.h index 68fb5cf9ba149..cc4a650d7fe25 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -141,7 +141,8 @@ struct ceph_inode_info { int i_nr_by_mode[4]; int i_cap_wanted; - loff_t i_wr_size; + loff_t i_max_size; /* size authorized by mds */ + loff_t i_wr_size; /* largest offset we've written (+1) */ struct timespec i_wr_mtime; struct timespec i_old_atime; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 8181be69c66ec..f78116203e85c 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -69,6 +69,7 @@ class CInode : public MDSCacheObject { static const int PIN_FREEZING = 13; static const int PIN_FROZEN = 14; static const int PIN_IMPORTINGCAPS = 15; + static const int PIN_FILEUPDATE = -16; const char *pin_name(int p) { switch (p) { @@ -86,6 +87,7 @@ class CInode : public MDSCacheObject { case PIN_FREEZING: return "freezing"; case PIN_FROZEN: return "frozen"; case PIN_IMPORTINGCAPS: return "importingcaps"; + case PIN_FILEUPDATE: return "fileupdate"; default: return generic_pin_name(p); } } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index fa70fe7536cd1..56e6b63ab528b 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -27,8 +27,6 @@ #include "events/EString.h" #include "events/EUpdate.h" -#include "events/EFileWrite.h" -#include "events/EFileAccess.h" #include "msg/Messenger.h" @@ -469,7 +467,7 @@ Capability* Locker::issue_new_caps(CInode *in, cap->set_wanted(my_want); cap->set_suppress(true); // suppress file cap messages for new cap (we'll bundle with the open() reply) } else { - // make sure it has sufficient caps + // make sure it wants sufficient caps if (my_want & ~cap->wanted()) { // augment wanted caps for this client cap->set_wanted(cap->wanted() | my_want); @@ -707,6 +705,19 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) } + +struct C_Locker_FileUpdate_finish : public Context { + Locker *locker; + CInode *in; + LogSegment *ls; + C_Locker_FileUpdate_finish(Locker *l, CInode *i, LogSegment *s) : + locker(l), in(i), ls(s) {} + void finish(int r) { + in->pop_and_dirty_projected_inode(ls); + in->put(CInode::PIN_FILEUPDATE); + } +}; + /* * note: we only get these from the client if * - we are calling back previously issued caps (fewer than the client previously had) @@ -774,8 +785,6 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) << " new_wanted " << cap_string(new_wanted) << dendl; if ((old_wanted & (CEPH_CAP_WR|CEPH_CAP_WRBUFFER|CEPH_CAP_WREXTEND)) && !(new_wanted & (CEPH_CAP_WR|CEPH_CAP_WRBUFFER|CEPH_CAP_WREXTEND))) { - dout(7) << " last wr-wanted cap, adjusting max_size" << dendl; - in->inode.max_size = 0; last_wr = true; } @@ -788,34 +797,46 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) } } - // merge in atime? - if (m->get_atime() > in->inode.atime) { - dout(7) << " taking atime " << m->get_atime() << " > " - << in->inode.atime << " for " << *in << dendl; - in->inode.atime = m->get_atime(); - mds->mdlog->submit_entry(new EFileAccess(mds->mdlog, in)); - } - - // mtime|size? + // atime|mtime|size? bool dirty = false; + if (m->get_atime() > in->inode.atime) + dirty = true; if ((has|had) & CEPH_CAP_WR) { - // mtime + if (m->get_mtime() > in->inode.mtime) + dirty = true; + if (m->get_size() > in->inode.size) + dirty = true; + } + if ((dirty || last_wr) && + !in->is_base()) { // FIXME.. what about root inode mtime/atime? + EUpdate *le = new EUpdate(mds->mdlog, "size|max_size|mtime|atime update"); + inode_t *pi = in->project_inode(); + pi->version = in->pre_dirty(); + if (last_wr) { + dout(7) << " last wr-wanted cap, max_size=0" << dendl; + pi->max_size = 0; + } if (m->get_mtime() > in->inode.mtime) { dout(7) << " taking mtime " << m->get_mtime() << " > " - << in->inode.mtime << " for " << *in << dendl; - in->inode.mtime = m->get_mtime(); - dirty = true; + << in->inode.mtime << " for " << *in << dendl; + pi->mtime = m->get_mtime(); } - // size - if ((loff_t)m->get_size() > in->inode.size) { + if (m->get_size() > in->inode.size) { dout(7) << " taking size " << m->get_size() << " > " - << in->inode.size << " for " << *in << dendl; - in->inode.size = m->get_size(); - dirty = true; + << in->inode.size << " for " << *in << dendl; + pi->size = m->get_size(); + } + if (m->get_atime() > in->inode.atime) { + dout(7) << " taking atime " << m->get_atime() << " > " + << in->inode.atime << " for " << *in << dendl; + pi->atime = m->get_atime(); } + le->metablob.add_dir_context(in->get_parent_dir()); + le->metablob.add_primary_dentry(in->parent, true, 0, pi); + LogSegment *ls = mds->mdlog->get_current_segment(); + in->get(CInode::PIN_FILEUPDATE); + mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, ls)); } - if (dirty || last_wr) - mds->mdlog->submit_entry(new EFileWrite(mds->mdlog, in)); // reevaluate, waiters if (!in->filelock.is_stable()) diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc index c71299fb20730..65b0bb2ec1322 100644 --- a/src/mds/LogEvent.cc +++ b/src/mds/LogEvent.cc @@ -32,9 +32,6 @@ #include "events/ESlaveUpdate.h" #include "events/EOpen.h" -#include "events/EFileWrite.h" -#include "events/EFileAccess.h" - #include "events/EPurgeFinish.h" #include "events/EAnchor.h" @@ -73,9 +70,6 @@ LogEvent *LogEvent::decode(bufferlist& bl) case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; case EVENT_OPEN: le = new EOpen; break; - case EVENT_FILEWRITE: le = new EFileWrite; break; - case EVENT_FILEACCESS: le = new EFileAccess; break; - case EVENT_PURGEFINISH: le = new EPurgeFinish; break; case EVENT_ANCHOR: le = new EAnchor; break; diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h index aa792708cd1b1..8d36a1d515c1c 100644 --- a/src/mds/LogEvent.h +++ b/src/mds/LogEvent.h @@ -29,8 +29,6 @@ #define EVENT_UPDATE 20 #define EVENT_SLAVEUPDATE 21 #define EVENT_OPEN 22 -#define EVENT_FILEWRITE 23 -#define EVENT_FILEACCESS 24 #define EVENT_PURGEFINISH 30 diff --git a/src/mds/Server.cc b/src/mds/Server.cc index e4f94b7393224..1121d21d2f1a1 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4099,6 +4099,7 @@ void Server::handle_client_openc(MDRequest *mdr) in->inode.mode = req->head.args.open.mode; in->inode.mode |= S_IFREG; in->inode.version = dn->pre_dirty() - 1; + in->inode.max_size = in->inode.layout.fl_object_size * in->inode.layout.fl_stripe_count; // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); diff --git a/src/mds/events/EFileAccess.h b/src/mds/events/EFileAccess.h deleted file mode 100644 index 232f21c80a7aa..0000000000000 --- a/src/mds/events/EFileAccess.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EFILEACCESS_H -#define __MDS_EFILEACCESS_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EFileAccess : public LogEvent { -public: - inodeno_t ino; - utime_t atime; - - EFileAccess() : LogEvent(EVENT_FILEACCESS) { } - EFileAccess(MDLog *mdlog, CInode *in) : - LogEvent(EVENT_FILEACCESS) { - ino = in->inode.ino; - atime = in->inode.atime; - } - - void print(ostream& out) { - out << "EFileAccess " << ino - << " atime " << atime; - } - - void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(atime, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(atime, bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/src/mds/events/EFileWrite.h b/src/mds/events/EFileWrite.h deleted file mode 100644 index e6300a572cd84..0000000000000 --- a/src/mds/events/EFileWrite.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EFILEWRITE_H -#define __MDS_EFILEWRITE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EFileWrite : public LogEvent { -public: - inodeno_t ino; - __u64 size, max_size; - utime_t mtime; - - EFileWrite() : LogEvent(EVENT_FILEWRITE) { } - EFileWrite(MDLog *mdlog, CInode *in) : - LogEvent(EVENT_FILEWRITE) { - ino = in->inode.ino; - size = in->inode.size; - max_size = in->inode.max_size; - mtime = in->inode.mtime; - } - - void print(ostream& out) { - out << "EFileWrite " << ino - << " size " << size - << " max " << max_size - << " mtime " << mtime; - } - - void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(size, bl); - ::_encode(max_size, bl); - ::_encode(mtime, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(size, bl, off); - ::_decode(max_size, bl, off); - ::_decode(mtime, bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 0ffcedde9b5d7..8bb875aea5e71 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -23,9 +23,6 @@ #include "events/ESlaveUpdate.h" #include "events/EOpen.h" -#include "events/EFileWrite.h" -#include "events/EFileAccess.h" - #include "events/EPurgeFinish.h" #include "events/EExport.h" @@ -603,48 +600,6 @@ void EOpen::replay(MDS *mds) } -// ------------------------ -// EFileWrite - -void EFileWrite::update_segment() -{ - // ?? -} - -void EFileWrite::replay(MDS *mds) -{ - dout(10) << "EFileWrite.replay " << dendl; - CInode *in = mds->mdcache->get_inode(ino); - if (in) { - in->inode.size = size; - in->inode.max_size = max_size; - in->inode.mtime = mtime; - } else { - dout(10) << " missing inode " << ino << dendl; - //assert(in); - } -} - -// ------------------------ -// EFileAccess - -void EFileAccess::update_segment() -{ - // ?? -} - -void EFileAccess::replay(MDS *mds) -{ - dout(10) << "EFileAccess.replay " << dendl; - CInode *in = mds->mdcache->get_inode(ino); - if (in) - in->inode.atime = atime; - else { - dout(10) << " missing inode " << ino << dendl; - //assert(in); - } -} - // ----------------------- // ESlaveUpdate diff --git a/src/messages/MClientFileCaps.h b/src/messages/MClientFileCaps.h index 3cec527dbc819..d33f97b615f0a 100644 --- a/src/messages/MClientFileCaps.h +++ b/src/messages/MClientFileCaps.h @@ -39,7 +39,8 @@ class MClientFileCaps : public Message { capseq_t get_seq() { return le32_to_cpu(h.seq); } inodeno_t get_ino() { return le64_to_cpu(h.ino); } - __u64 get_size() { return le64_to_cpu(h.size); } + __s64 get_size() { return le64_to_cpu(h.size); } + __s64 get_max_size() { return le64_to_cpu(h.max_size); } utime_t get_mtime() { return utime_t(h.mtime); } utime_t get_atime() { return utime_t(h.atime); } @@ -74,6 +75,7 @@ class MClientFileCaps : public Message { h.wanted = cpu_to_le32(wanted); h.ino = cpu_to_le64(inode.ino); h.size = cpu_to_le64(inode.size); + h.max_size = cpu_to_le64(inode.max_size); h.migrate_mds = cpu_to_le32(mmds); h.migrate_seq = cpu_to_le32(mseq); inode.mtime.encode_timeval(&h.mtime); @@ -87,6 +89,7 @@ class MClientFileCaps : public Message { << " seq " << le32_to_cpu(h.seq) << " caps " << cap_string(le32_to_cpu(h.caps)) << " wanted" << cap_string(le32_to_cpu(h.wanted)) + << " size " << le64_to_cpu(h.size) << "/" << le64_to_cpu(h.max_size) << ")"; } diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index bdd3c6e26e14e..f7831fc1dfcf6 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -115,6 +115,7 @@ struct InodeStat { inode.gid = e.gid; inode.nlink = e.nlink; inode.size = e.size; + inode.max_size = e.max_size; inode.rdev = e.rdev; int n = e.fragtree.nsplits; @@ -152,6 +153,7 @@ struct InodeStat { e.gid = in->inode.gid; e.nlink = in->inode.nlink; e.size = in->inode.size; + e.max_size = in->inode.max_size; e.rdev = in->inode.rdev; e.mask = mask; e.fragtree.nsplits = in->dirfragtree._splits.size(); @@ -209,7 +211,6 @@ class MClientReply : public Message { memset(&st, 0, sizeof(st)); this->st.tid = req->get_tid(); this->st.op = req->get_op(); - this->st.result = result; } virtual ~MClientReply() { -- 2.39.5