From 4a499ee25cb848062873e87b1bd7c91f3d7722c5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Jan 2009 16:30:10 -0800 Subject: [PATCH] mds: bunch of cap protocol changes, and user client adjustments. See capability protocol notes in mds/Capability.h --- src/TODO | 43 +++- src/client/Client.cc | 147 ++++++++----- src/client/Client.h | 24 ++- src/include/ceph_fs.h | 27 ++- src/kernel/caps.c | 7 +- src/mds/CInode.cc | 3 +- src/mds/CInode.h | 4 + src/mds/Capability.h | 52 ++++- src/mds/Locker.cc | 411 ++++++++++++++++++------------------- src/mds/MDCache.cc | 11 +- src/mds/Migrator.cc | 11 +- src/messages/MClientCaps.h | 16 +- 12 files changed, 445 insertions(+), 311 deletions(-) diff --git a/src/TODO b/src/TODO index 4fe87604f78a7..55b652e8c2dca 100644 --- a/src/TODO +++ b/src/TODO @@ -46,19 +46,42 @@ timer caps -- notes - - client should not get unsolicited MClientCaps if wanted==0. - - if we do un-acked cap release, we need to handle unsolicited import/export - - may unacked release _only_ if wanted==0? -- maybe we really want a 'dirty' mask - - client knows when it has dirty data to writeback, and thus when it must wait for it to flush? + +- two types of cap events from mds -> client + - cap issue .. in a reply, or an IMPORT cap op. + - cap update (revocation, etc.) .. an MClientCaps message. +- if client has cap, mds should too. +- if client has no dirty data, it can release without waiting for an ack. + - client may thus get a cap _update_ and not have the cap. ignore it. + - mds should track seq of last _issue_ (not update). any release attempt will only succeed if the client has seen the latest issue. + - if client gets an IMPORT issue and doesn't have the inode, immediately send a release. + +- a FLUSH writes dirty metadata. it also usually releases wr/excl caps. + - 'dirty' is which metadata is to be written. + - 'caps' is which caps are being retained by the client. +? - if 0, we can discard the cap. 
+ - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written. +- a FLUSH_ACK acks a FLUSH. + - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back) + - 'seq' is the _original_ FLUSH's seq. + - 'caps' is the _original_ FLUSH's caps. +- a FLUSHSNAP flushes snapshot metadata. + - 'dirty' indicates which caps were dirty, if any. + - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK. +- a RELEASE releases one or more (clean) caps. + - 'caps' is which caps are retained by the client. + - 'wanted' is which caps the client wants. + - dirty==0 + - if caps==0, mds can close out the cap (provided there are no racing cap issues) +- a WANTED simply updates the client's 'wanted' mask. + - 'wanted' is which caps the client wants. + - other fields should be ignored? - kclient - - only pin caps with wanted != 0? - - put unwanted caps on an lru list; expire + - only pin caps with dirty metadata? and/or wanted != 0? + - and/or, put unwanted caps on an lru list, and expire? - mds - - segregate wanted/unwanted caps? - + - segregate wanted/unwanted caps? rd/wr caps? 
kernel client diff --git a/src/client/Client.cc b/src/client/Client.cc index 89f90d58466c4..82e36fe0c4be7 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -413,6 +413,7 @@ void Client::update_inode(Inode *in, InodeStat *st, utime_t from, int mds) } if ((issued & CEPH_CAP_XATTR_EXCL) == 0) { +#warning xattrs in->xattrs.swap(st->xattrs); } @@ -1374,7 +1375,7 @@ void Client::put_inode(Inode *in, int n) if (in->snapdir_parent) put_inode(in->snapdir_parent); inode_map.erase(in->vino()); - in->cap_delay_item.remove_myself(); + in->cap_item.remove_myself(); in->snaprealm_item.remove_myself(); if (in == root) root = 0; delete in; @@ -1447,13 +1448,16 @@ void Client::cap_delay_requeue(Inode *in) in->hold_caps_until = g_clock.now(); in->hold_caps_until += 5.0; - delayed_caps.push_back(&in->cap_delay_item); + delayed_caps.push_back(&in->cap_item); } void Client::check_caps(Inode *in, bool is_delayed) { unsigned wanted = in->caps_wanted(); unsigned used = in->caps_used(); + int like = wanted; + if (!unmounting) + like |= CEPH_CAP_ANY_RD; dout(10) << "check_caps on " << *in << " wanted " << ccap_string(wanted) @@ -1482,21 +1486,24 @@ void Client::check_caps(Inode *in, bool is_delayed) InodeCap *cap = it->second; it++; - int like = wanted; - if (!unmounting) - like |= CEPH_CAP_ANY_RD; - int revoking = cap->implemented & ~cap->issued; + dout(10) << " cap mds" << mds + << " issued " << ccap_string(cap->issued) + << " implemented " << ccap_string(cap->implemented) + << " revoking " << ccap_string(revoking) << dendl; + if (in->wanted_max_size > in->inode.max_size && in->wanted_max_size > in->requested_max_size) goto ack; /* completed revocation? */ if (revoking && (revoking && used) == 0) { - dout(10) << "completed revocation of " << (cap->implemented & ~cap->issued) << dendl; + dout(10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl; goto ack; } + if (!revoking && unmounting) + goto ack; /* approaching file_max? 
*/ if ((cap->issued & CEPH_CAP_FILE_WR) && @@ -1518,24 +1525,35 @@ void Client::check_caps(Inode *in, bool is_delayed) } ack: - int op; - if (wanted == 0) + int op = CEPH_CAP_OP_UPDATE; + if (like == 0) op = CEPH_CAP_OP_RELEASE; else - op = CEPH_CAP_OP_ACK; + op = CEPH_CAP_OP_UPDATE; + + cap->flushing |= in->caps_dirty() & cap->issued; + if (cap->flushing) { + dout(10) << " flushing " << ccap_string(cap->flushing) << dendl; + in->dirty_caps &= ~cap->flushing; + } + cap->issued &= like; + MClientCaps *m = new MClientCaps(op, in->inode, 0, cap->seq, cap->issued, wanted, + cap->flushing, cap->mseq); - cap->wanted = wanted; in->reported_size = in->inode.size; m->set_max_size(in->wanted_max_size); in->requested_max_size = in->wanted_max_size; m->set_snap_follows(in->snaprealm->get_snap_context().seq); messenger->send_message(m, mdsmap->get_inst(mds)); + + if (cap->flushing == 0 && cap->issued == 0) + remove_cap(in, mds); } } @@ -1731,6 +1749,7 @@ void Client::add_update_cap(Inode *in, int mds, in->exporting_mseq = 0; } in->caps[mds] = cap = new InodeCap; + cap_list.push_back(&in->cap_item); } unsigned old_caps = cap->issued; @@ -1739,7 +1758,9 @@ void Client::add_update_cap(Inode *in, int mds, cap->seq = seq; cap->mseq = mseq; dout(10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued) - << " from mds" << mds << dendl; + << " from mds" << mds + << " on " << *in + << dendl; if (issued & ~old_caps) signal_cond_list(in->waitfor_caps); @@ -1747,6 +1768,7 @@ void Client::add_update_cap(Inode *in, int mds, void Client::remove_cap(Inode *in, int mds) { + dout(10) << "remove_cap mds" << mds << " on " << *in << dendl; assert(in->caps.count(mds)); in->caps.erase(mds); if (in->caps.empty()) { @@ -1999,8 +2021,15 @@ void Client::handle_caps(MClientCaps *m) if (inode_map.count(vino)) in = inode_map[vino]; if (!in) { dout(5) << "handle_caps don't have vino " << vino << dendl; - assert(0); // shouldn't happen - delete m; + + if (m->get_op() == 
CEPH_CAP_OP_IMPORT) { + // release. + m->set_op(CEPH_CAP_OP_RELEASE); + m->head.caps = 0; + m->head.dirty = 0; + messenger->send_message(m, m->get_source_inst()); + } else + delete m; return; } @@ -2009,8 +2038,8 @@ void Client::handle_caps(MClientCaps *m) case CEPH_CAP_OP_EXPORT: return handle_cap_export(in, m); case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(in, m); case CEPH_CAP_OP_GRANT: return handle_cap_grant(in, m); - case CEPH_CAP_OP_RELEASED: return handle_cap_released(in, m); - case CEPH_CAP_OP_FLUSHEDSNAP: return handle_cap_flushedsnap(in, m); + case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(in, m); + case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(in, m); default: delete m; } @@ -2102,18 +2131,29 @@ void Client::handle_cap_trunc(Inode *in, MClientCaps *m) delete m; } -void Client::handle_cap_released(Inode *in, MClientCaps *m) +void Client::handle_cap_flush_ack(Inode *in, MClientCaps *m) { int mds = m->get_source().num(); assert(in->caps[mds]); + InodeCap *cap = in->caps[mds]; - dout(5) << "handle_cap_released mds" << mds << " released cap on " << *in << dendl; - remove_cap(in, mds); + int cleaned = m->get_dirty() & ~m->get_caps(); + dout(5) << "handle_cap_flush_ack mds" << mds + << " cleaned " << ccap_string(cleaned) << " on " << *in << dendl; + cap->flushing &= ~cleaned; + dout(5) << " cap->flushing now " << ccap_string(cap->flushing) + << ", in->caps_dirty() now " << ccap_string(in->caps_dirty()) << dendl; + + if (m->get_caps() == 0 && + m->get_seq() == cap->seq) { + assert(in->caps_dirty() == 0); + remove_cap(in, mds); + } delete m; } -void Client::handle_cap_flushedsnap(Inode *in, MClientCaps *m) +void Client::handle_cap_flushsnap_ack(Inode *in, MClientCaps *m) { int mds = m->get_source().num(); assert(in->caps[mds]); @@ -2127,7 +2167,7 @@ void Client::handle_cap_flushedsnap(Inode *in, MClientCaps *m) } else { dout(5) << "handle_cap_flushedsnap DUP(?) 
mds" << mds << " flushed snap follows " << follows << " on " << *in << dendl; - // we may not have it if we send multiple FLUSHSNAP requests and (get multiple FLUSHEDSNAPs back) + // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back) } delete m; @@ -2137,15 +2177,18 @@ void Client::handle_cap_flushedsnap(Inode *in, MClientCaps *m) void Client::handle_cap_grant(Inode *in, MClientCaps *m) { int mds = m->get_source().num(); - assert(in->caps[mds]); + if (in->caps.count(mds) == 0) { + dout(5) << "handle_cap_grant on ino " << m->get_ino() << " no cap for mds" << mds << dendl; + delete m; + return; + } InodeCap *cap = in->caps[mds]; - cap->seq = m->get_seq(); in->inode.layout = m->get_layout(); // don't want it? - int wanted = in->caps_wanted(); /* + int wanted = in->caps_wanted(); if (wanted == 0) { dout(5) << "handle_cap_grant on ino " << m->get_ino() << " seq " << m->get_seq() @@ -2169,7 +2212,20 @@ void Client::handle_cap_grant(Inode *in, MClientCaps *m) << " caps now " << ccap_string(new_caps) << " was " << ccap_string(old_caps) << dendl; - // size/ctime/mtime/atime + // update inode + int issued = in->caps_issued(); + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + in->inode.mode = m->head.mode; + in->inode.uid = m->head.uid; + in->inode.gid = m->head.gid; + } + if ((issued & CEPH_CAP_LINK_EXCL) == 0) { + in->inode.nlink = m->head.nlink; + in->inode.anchored = false; /* lie */ + } + if ((issued & CEPH_CAP_XATTR_EXCL) == 0) { +#warning xattrs in->xattrs.swap(m->xattrs); + } update_inode_file_bits(in, m->get_truncate_seq(), m->get_size(), m->get_time_warp_seq(), m->get_ctime(), m->get_mtime(), m->get_atime(), old_caps); @@ -2186,9 +2242,6 @@ void Client::handle_cap_grant(Inode *in, MClientCaps *m) } // update caps - - bool ack = false; - if (old_caps & ~new_caps) { dout(10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl; cap->issued = new_caps; @@ -2198,20 +2251,9 @@ void Client::handle_cap_grant(Inode *in, 
MClientCaps *m) if ((used & ~new_caps) & CEPH_CAP_FILE_WRBUFFER) _flush(in); - else { - ack = true; - cap->implemented = new_caps; - - // share our (possibly newer) file size, mtime, atime - m->set_size(in->inode.size); - m->set_max_size(0); // dont re-request - m->set_mtime(in->inode.mtime); - m->set_atime(in->inode.atime); - m->set_wanted(wanted); - cap->wanted = wanted; - m->set_snap_follows(in->snaprealm->get_snap_context().seq); - m->set_migrate_seq(cap->mseq); - } + else + check_caps(in, false); + } else if (old_caps == new_caps) { dout(10) << " caps unchanged at " << ccap_string(old_caps) << dendl; } else { @@ -2223,10 +2265,7 @@ void Client::handle_cap_grant(Inode *in, MClientCaps *m) if (new_caps) signal_cond_list(in->waitfor_caps); - if (ack) - messenger->send_message(m, m->get_source_inst()); - else - delete m; + delete m; } @@ -2360,11 +2399,7 @@ int Client::unmount() // NOTE: i'm assuming all caches are already flushing (because all files are closed). assert(fd_map.empty()); - dout(10) << "a" << dendl; - _ll_drop_pins(); - - dout(10) << "b" << dendl; // empty lru cache lru.lru_set_max(0); @@ -2379,6 +2414,14 @@ int Client::unmount() check_caps(in, true); } + // other caps, too + p = cap_list.begin(); + while (!p.end()) { + Inode *in = *p; + ++p; + check_caps(in, true); + } + if (g_conf.client_oc) { // release any/all caps hash_map::iterator next; @@ -2499,6 +2542,7 @@ void Client::tick() if (in->hold_caps_until > now) break; delayed_caps.pop_front(); + cap_list.push_back(&in->cap_item); check_caps(in, true); } } @@ -2975,6 +3019,7 @@ int Client::_utimes(const filepath &path, utime_t mtime, utime_t atime, int mask dn->inode->inode.mtime = mtime; if (mask & CEPH_UTIME_ATIME) dn->inode->inode.atime = atime; + dn->inode->dirty_caps |= CEPH_CAP_FILE_EXCL; return 0; } @@ -3905,6 +3950,7 @@ int Client::_write(Fh *f, __s64 offset, __u64 size, const char *buf) // extend file? 
if (totalwritten + offset > in->inode.size) { in->inode.size = totalwritten + offset; + in->dirty_caps |= CEPH_CAP_FILE_WR; if ((in->inode.size << 1) >= in->inode.max_size && (in->reported_size << 1) < in->inode.max_size) @@ -3917,6 +3963,7 @@ int Client::_write(Fh *f, __s64 offset, __u64 size, const char *buf) // mtime in->inode.mtime = g_clock.real_now(); + in->dirty_caps |= CEPH_CAP_FILE_WR; put_cap_ref(in, CEPH_CAP_FILE_WR); diff --git a/src/client/Client.h b/src/client/Client.h index c7ae260069b2f..8bb18d2037fc1 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -162,10 +162,11 @@ struct InodeCap { unsigned issued; unsigned implemented; unsigned wanted; // as known to mds. + unsigned flushing; __u64 seq; __u32 mseq; // migration seq - InodeCap() : issued(0), implemented(0), wanted(0), seq(0), mseq(0) {} + InodeCap() : issued(0), implemented(0), wanted(0), flushing(0), seq(0), mseq(0) {} }; struct CapSnap { @@ -194,12 +195,13 @@ class Inode { // per-mds caps map caps; // mds -> InodeCap + unsigned dirty_caps; int snap_caps, snap_cap_refs; unsigned exporting_issued; int exporting_mds; capseq_t exporting_mseq; utime_t hold_caps_until; - xlist::item cap_delay_item; + xlist::item cap_item; SnapRealm *snaprealm; xlist::item snaprealm_item; @@ -263,9 +265,10 @@ class Inode { snapid(vino.snapid), lease_mask(0), lease_mds(-1), dir_auth(-1), dir_hashed(false), dir_replicated(false), + dirty_caps(0), snap_caps(0), snap_cap_refs(0), exporting_issued(0), exporting_mds(-1), exporting_mseq(0), - cap_delay_item(this), + cap_item(this), snaprealm(0), snaprealm_item(this), snapdir_parent(0), reported_size(0), wanted_max_size(0), requested_max_size(0), ref(0), ll_ref(0), @@ -333,6 +336,14 @@ class Inode { want |= CEPH_CAP_FILE_EXCL; return want; } + int caps_dirty() { + int flushing = dirty_caps; + for (map::iterator it = caps.begin(); + it != caps.end(); + it++) + flushing |= it->second->flushing; + return flushing; + } bool have_valid_size() { // RD+RDCACHE or 
WR+WRBUFFER => valid size @@ -614,7 +625,8 @@ protected: Inode* root; LRU lru; // lru list of Dentry's in our local metadata cache. - xlist delayed_caps; + // all inodes with caps sit on either cap_list or delayed_caps. + xlist delayed_caps, cap_list; hash_map snap_realms; SnapRealm *get_snap_realm(inodeno_t r) { @@ -828,8 +840,8 @@ protected: void handle_cap_import(Inode *in, class MClientCaps *m); void handle_cap_export(Inode *in, class MClientCaps *m); void handle_cap_trunc(Inode *in, class MClientCaps *m); - void handle_cap_released(Inode *in, class MClientCaps *m); - void handle_cap_flushedsnap(Inode *in, class MClientCaps *m); + void handle_cap_flush_ack(Inode *in, class MClientCaps *m); + void handle_cap_flushsnap_ack(Inode *in, class MClientCaps *m); void handle_cap_grant(Inode *in, class MClientCaps *m); void cap_delay_requeue(Inode *in); void check_caps(Inode *in, bool is_delayed); diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 4ca9a92d3534e..9d9684e88ae9f 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -938,10 +938,11 @@ static inline int ceph_flags_to_mode(int flags) CEPH_CAP_FILE_RDCACHE | \ CEPH_CAP_XATTR_RDCACHE) -#define CEPH_CAP_ANY_RD (CEPH_CAP_AUTH_RDCACHE | \ - CEPH_CAP_LINK_RDCACHE | \ - CEPH_CAP_XATTR_RDCACHE | \ - CEPH_CAP_FILE_RDCACHE | CEPH_CAP_FILE_RD) +#define CEPH_CAP_ANY_RDCACHE (CEPH_CAP_AUTH_RDCACHE | \ + CEPH_CAP_LINK_RDCACHE | \ + CEPH_CAP_XATTR_RDCACHE | \ + CEPH_CAP_FILE_RDCACHE) +#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_RDCACHE | CEPH_CAP_FILE_RD) #define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ CEPH_CAP_LINK_EXCL | \ @@ -986,12 +987,11 @@ enum { CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ CEPH_CAP_OP_IMPORT, /* mds has imported the cap from specified mds */ - CEPH_CAP_OP_RELEASED, /* mds->client close out cap */ - CEPH_CAP_OP_FLUSHEDSNAP, /* mds->client flushed snap */ - CEPH_CAP_OP_ACK, /* client->mds ack (if prior grant was recall) */ - 
CEPH_CAP_OP_REQUEST, /* client->mds request (update wanted bits) */ + CEPH_CAP_OP_UPDATE, /* client->mds update */ + CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed. if caps=0, cap also released. */ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ - CEPH_CAP_OP_RELEASE, /* client->mds request release cap */ + CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ }; static inline const char *ceph_cap_op_name(int op) @@ -1001,11 +1001,10 @@ static inline const char *ceph_cap_op_name(int op) case CEPH_CAP_OP_TRUNC: return "trunc"; case CEPH_CAP_OP_EXPORT: return "export"; case CEPH_CAP_OP_IMPORT: return "import"; - case CEPH_CAP_OP_RELEASED: return "released"; - case CEPH_CAP_OP_FLUSHEDSNAP: return "flushedsnap"; - case CEPH_CAP_OP_ACK: return "ack"; - case CEPH_CAP_OP_REQUEST: return "request"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; case CEPH_CAP_OP_RELEASE: return "release"; default: return "???"; } @@ -1018,7 +1017,7 @@ struct ceph_mds_caps { __le32 op; __le64 ino, realm; __le32 seq; - __le32 caps, wanted; + __le32 caps, wanted, dirty; __le32 migrate_seq; __le64 snap_follows; __le32 snap_trace_len; diff --git a/src/kernel/caps.c b/src/kernel/caps.c index f50f6ff2d6c76..aa3c05e6c7deb 100644 --- a/src/kernel/caps.c +++ b/src/kernel/caps.c @@ -644,13 +644,16 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if ((revoking && (revoking & used) == 0) || - (mdsc->stopping && (used == 0))) { + if (revoking && (revoking & used) == 0) { dout(10, "completed revocation of %d\n", cap->implemented & ~cap->issued); goto ack; } + /* shutting down? 
*/ + if (!revoking && mdsc->stopping && (used == 0)) + goto ack; + if ((cap->issued & ~(file_wanted | used)) == 0) continue; /* nothing extra, all good */ diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index fa6d9500a42f7..ee04c2a61967b 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1355,7 +1355,8 @@ bool CInode::encode_inodestat(bufferlist& bl, Session *session, } if (cap && valid) { bool loner = (get_loner() == client); - int issue = (cap->wanted() | CEPH_CAP_ANY_RD) & get_caps_allowed(loner); + int likes = get_caps_liked(); + int issue = (cap->wanted() | likes) & get_caps_allowed(loner); int pending = cap->pending(); if (issue & ~pending) { dout(10) << "encode_inodestat issuing " << ccap_string(issue) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index c7842ee4f560b..f5829fae024f3 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -602,6 +602,10 @@ public: } // caps allowed + int get_caps_liked() { + return CEPH_CAP_PIN | + (is_dir() ? CEPH_CAP_ANY_RDCACHE : CEPH_CAP_ANY_RD); + } int get_caps_allowed_ever() { return CEPH_CAP_PIN | diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 0cdb1d7a60e52..3b4081362854c 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -24,6 +24,48 @@ using namespace std; #include "config.h" + +/* + + Capability protocol notes. + +- two types of cap events from mds -> client: + - cap "issue" in a MClientReply, or an MClientCaps IMPORT op. + - cap "update" (revocation, etc.) .. an MClientCaps message. +- if client has cap, the mds should have it too. + +- if client has no dirty data, it can release it without waiting for an mds ack. + - client may thus get a cap _update_ and not have the cap. ignore it. + +- mds should track seq of last _issue_ (not update). any release + attempt will only succeed if the client has seen the latest issue. + - if client gets an IMPORT issue and doesn't have the inode, immediately send a release. + +- a UPDATE updates the clients issued caps, wanted, etc. 
it may also flush dirty metadata. + - 'caps' are which caps the client retains. + - if 0, client wishes to release the cap + - 'wanted' is which caps the client wants. + - 'dirty' is which metadata is to be written. + - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written. + +- a FLUSH_ACK acks a FLUSH. + - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back) + - 'seq' is the _original_ FLUSH's seq. + - 'caps' is the _original_ FLUSH's caps. + - client can conclude that (dirty & ~caps) bits were successfully cleaned. + +- a FLUSHSNAP flushes snapshot metadata. + - 'dirty' indicates which caps were dirty, if any. + - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK. + +- a RELEASE releases one or more (clean) caps. + - 'caps' is which caps are retained by the client. + - 'wanted' is which caps the client wants. + - dirty==0 + - if caps==0, mds can close out the cap (provided there are no racing cap issues) + + */ + class CInode; class Capability { @@ -59,7 +101,7 @@ private: map cap_history; // seq -> cap, [last_recv,last_sent] capseq_t last_sent, last_recv; - capseq_t last_open; + capseq_t last_issue; capseq_t mseq; int suppress; @@ -79,7 +121,7 @@ public: wanted_caps(want), last_sent(s), last_recv(s), - last_open(0), + last_issue(0), mseq(0), suppress(0), stale(false), releasing(0), client_follows(0), @@ -88,8 +130,8 @@ public: capseq_t get_mseq() { return mseq; } capseq_t get_last_sent() { return last_sent; } - capseq_t get_last_open() { return last_open; } - void set_last_open() { last_open = last_sent; } + + capseq_t get_last_issue() { return last_issue; } bool is_suppress() { return suppress > 0; } void inc_suppress() { suppress++; } @@ -145,7 +187,7 @@ public: // issue caps; return seq number. 
capseq_t issue(int c) { - ++last_sent; + last_issue = ++last_sent; cap_history[last_sent] = c; return last_sent; } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index d769db1dfec66..2e5e8f2ec1ac7 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -570,7 +570,6 @@ Capability* Locker::issue_new_caps(CInode *in, // re-issue whatever we can //cap->issue(cap->pending()); - //cap->set_last_open(); // not used, atm. return cap; } @@ -639,8 +638,7 @@ bool Locker::issue_caps(CInode *in) nissued++; // include caps that clients generally like, while we're at it. - int likes = CEPH_CAP_ANY_RD; - + int likes = in->get_caps_liked(); int before = pending; long seq = cap->issue((wanted|likes) & allowed); int after = cap->pending(); @@ -655,8 +653,7 @@ bool Locker::issue_caps(CInode *in) in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), - after, - wanted, + after, wanted, 0, cap->get_mseq()), it->first); } @@ -678,8 +675,7 @@ void Locker::issue_truncate(CInode *in) in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), - cap->pending(), - cap->wanted(), + cap->pending(), cap->wanted(), 0, cap->get_mseq()), it->first); } @@ -966,12 +962,11 @@ void Locker::share_inode_max_size(CInode *in) if (cap->pending() & (CEPH_CAP_GWR<send_message_client(new MClientCaps(CEPH_CAP_OP_GRANT, - in->inode, - in->find_snaprealm()->inode->ino(), - cap->get_last_seq(), - cap->pending(), - cap->wanted(), - cap->get_mseq()), + in->inode, + in->find_snaprealm()->inode->ino(), + cap->get_last_seq(), + cap->pending(), cap->wanted(), 0, + cap->get_mseq()), client); } } @@ -1034,98 +1029,101 @@ void Locker::handle_client_caps(MClientCaps *m) return; } - // flushsnap? - if (m->get_op() == CEPH_CAP_OP_FLUSHSNAP) { - dout(7) << " flushsnap follows " << follows - << " client" << client << " on " << *in << dendl; - // this cap now follows a later snap (i.e. 
the one initiating this flush, or later) - cap->client_follows = follows+1; - - if (in->last <= follows) { - dout(10) << " flushsnap releasing cloned cap" << dendl; - in->remove_client_cap(client); - } else { - dout(10) << " flushsnap NOT releasing live cap" << dendl; - } - - // we can prepare the ack now, since this FLUSHEDSNAP is independent of any - // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst - // case we get a dup response, so whatever.) - MClientCaps *ack = new MClientCaps(CEPH_CAP_OP_FLUSHEDSNAP, in->inode, 0, 0, 0, 0, 0); - ack->set_snap_follows(follows); - if (!_do_cap_update(in, m->get_caps(), in->get_caps_wanted(), follows, m, ack)) { - mds->send_message_client(ack, client); - eval_cap_gather(in); - } - } else { + int op = m->get_op(); - // for this and all subsequent versions of this inode, - while (1) { - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & head_in->get_caps_allowed_ever(); - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - dout(10) << " follows " << follows - << ", had " << ccap_string(had) - << ", has " << ccap_string(has) - << " on " << *in << dendl; + // flushsnap? + if (op == CEPH_CAP_OP_FLUSHSNAP) { + if (in->is_auth()) { + dout(7) << " flushsnap follows " << follows + << " client" << client << " on " << *in << dendl; + // this cap now follows a later snap (i.e. the one initiating this flush, or later) + cap->client_follows = follows+1; + + if (in->last <= follows) { + dout(10) << " flushsnap releasing cloned cap" << dendl; + in->remove_client_cap(client); + } else { + dout(10) << " flushsnap NOT releasing live cap" << dendl; + } + // we can prepare the ack now, since this FLUSHEDSNAP is independent of any + // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst + // case we get a dup response, so whatever.) 
MClientCaps *ack = 0; - capseq_t releasecap = 0; - - if (m->get_seq() < cap->get_last_sent()) { - /* client may be trying to release caps (i.e. inode closed, etc.) - * by setting reducing wanted set. but it may also be opening the - * same filename, not sure that it'll map to the same inode. so, - * we don't want RELEASE or wanted updates to clobber mds's notion - * of wanted unless we're sure the client has seen all the latest - * caps. - * - * we use last_sent here, not last_open, just to keep the client - * logic for deciding when to reply to a revocation simple. - */ - dout(10) << " ignoring release|wanted " << ccap_string(m->get_wanted()) - << " bc seq " << m->get_seq() << " < last sent " << cap->get_last_sent() << dendl; - } else if (m->get_op() == CEPH_CAP_OP_RELEASE) { - dout(7) << " release request client" << client << " seq " << m->get_seq() << " on " << *in << dendl; - releasecap = m->get_seq(); - /* - * if multiple release requests overlap (i.e. because the first one is waiting - * for the log to flush), wait for them all to "complete", and only ack the - * last one. to do this, keep count; see matching decrement in _finish_release_cap(). - */ - cap->releasing++; - ack = new MClientCaps(CEPH_CAP_OP_RELEASED, in->inode, 0, 0, 0, 0, 0); - } else if (wanted != cap->wanted()) { - dout(10) << " wanted " << ccap_string(cap->wanted()) - << " -> " << ccap_string(wanted) << dendl; - cap->set_wanted(wanted); + if (m->get_dirty()) { + ack = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP_ACK, in->inode, 0, 0, 0, 0, m->get_dirty(), 0); + ack->set_snap_follows(follows); } - - if (((has|had) & CEPH_CAP_ANY_WR) == 0 || // didn't have any wr caps, - !_do_cap_update(in, has|had, in->get_caps_wanted() | wanted, // or didn't change anything - follows, m, ack, releasecap)) { - // no update, ack now. 
- if (releasecap) { - assert(ack); - _finish_release_cap(in, client, releasecap, ack); - } else if (ack) + if (!_do_cap_update(in, m->get_dirty(), 0, follows, m, ack)) { + if (ack) mds->send_message_client(ack, client); - eval_cap_gather(in); } + } else + dout(7) << " not auth, ignoring flushsnap on " << *in << dendl; + goto out; + } + + + // for this and all subsequent versions of this inode, + while (1) { + // filter wanted based on what we could ever give out (given auth/replica status) + int wanted = m->get_wanted() & head_in->get_caps_allowed_ever(); + int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); + int has = cap->confirmed(); + dout(10) << " follows " << follows + << ", had " << ccap_string(had) + << ", has " << ccap_string(has) + << " on " << *in << dendl; + + MClientCaps *ack = 0; + capseq_t releasecap = 0; + + if (m->get_dirty() && in->is_auth()) { + dout(7) << " flush client" << client << " dirty " << ccap_string(m->get_dirty()) + << " seq " << m->get_seq() << " on " << *in << dendl; + ack = new MClientCaps(CEPH_CAP_OP_FLUSH_ACK, in->inode, 0, m->get_seq(), + m->get_caps(), 0, m->get_dirty(), 0); + } + if (m->get_caps() == 0) { + if (m->get_seq() < cap->get_last_issue()) { + dout(7) << " releasing request client" << client << " seq " << m->get_seq() << " on " << *in << dendl; + cap->releasing++; + releasecap = m->get_seq(); + } else { + dout(7) << " NOT releasing request client" << client << " seq " << m->get_seq() + << " (issue race) on " << *in << dendl; + } + } + if (wanted != cap->wanted()) { + dout(10) << " wanted " << ccap_string(cap->wanted()) + << " -> " << ccap_string(wanted) << dendl; + cap->set_wanted(wanted); + } + + if (!_do_cap_update(in, m->get_dirty(), m->get_wanted(), follows, m, ack, releasecap)) { + // no update, ack now. + if (releasecap) + _finish_release_cap(in, client, releasecap, ack); + else if (ack) + mds->send_message_client(ack, client); - // done? - if (in->last == CEPH_NOSNAP) - break; - - // next! 
- in = mdcache->pick_inode_snap(in, in->last); - cap = in->get_client_cap(client); - assert(cap); + eval_cap_gather(in); + if (in->filelock.is_stable()) + file_eval(&in->filelock); } + + // done? + if (in->last == CEPH_NOSNAP) + break; + + // next! + in = mdcache->pick_inode_snap(in, in->last); + cap = in->get_client_cap(client); + assert(cap); } - + + out: delete m; } @@ -1153,7 +1151,8 @@ void Locker::_finish_release_cap(CInode *in, int client, capseq_t seq, MClientCa if (!in->is_auth()) request_inode_file_caps(in); - mds->send_message_client(ack, client); + if (ack) + mds->send_message_client(ack, client); // unlinked stray? may need to purge (e.g., after all caps are released) if (in->inode.nlink == 0 && @@ -1165,140 +1164,140 @@ void Locker::_finish_release_cap(CInode *in, int client, capseq_t seq, MClientCa } } -bool Locker::_do_cap_update(CInode *in, int had, int all_wanted, snapid_t follows, MClientCaps *m, +/* + * update inode based on cap flush|flushsnap|wanted. + * adjust max_size, if needed. + * if we update, return true; otherwise, false (no updated needed). + */ +bool Locker::_do_cap_update(CInode *in, int dirty, int wanted, snapid_t follows, MClientCaps *m, MClientCaps *ack, capseq_t releasecap) { - dout(10) << "_do_cap_update had " << ccap_string(had) << " on " << *in << dendl; - + dout(10) << "_do_cap_update dirty " << ccap_string(dirty) + << " wanted " << ccap_string(wanted) + << " on " << *in << dendl; + assert(in->is_auth()); int client = m->get_source().num(); - inode_t *latest = in->get_projected_inode(); - utime_t atime = m->get_atime(); - utime_t mtime = m->get_mtime(); - utime_t ctime = m->get_ctime(); - uint64_t size = m->get_size(); - - // atime|mtime|size? 
- bool had_or_has_wr = had & (CEPH_CAP_GWR << CEPH_CAP_SFILE); - bool file_excl = had & (CEPH_CAP_GEXCL << CEPH_CAP_SFILE); - bool dirty_atime = false; - bool dirty_mtime = false; - bool dirty_ctime = false; - bool dirty_size = false; - bool dirty_file = false; - if (had_or_has_wr || file_excl) { - if (mtime > latest->mtime || (file_excl && mtime != latest->mtime)) - dirty_file = dirty_mtime = true; - if (ctime > latest->ctime) - dirty_file = dirty_ctime = true; - if (size > latest->size) - dirty_file = dirty_size = true; - } - if (file_excl && atime != latest->atime) - dirty_file = dirty_atime = true; - bool dirty_mode = false; - bool dirty_owner = false; - bool dirty_auth = false; - if (had & (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)) { - if (m->head.uid != latest->uid || - m->head.gid != latest->gid) - dirty_auth = dirty_owner = true; - if (m->head.mode != latest->mode) - dirty_auth = dirty_mode = true; - } - bool dirty = dirty_file || dirty_auth; - + if (in->is_base()) + return false; // FIXME? + // increase or zero max_size? + __u64 size = m->get_size(); bool change_max = false; uint64_t new_max = latest->max_size; - - if (in->is_auth()) { - if (latest->max_size && (all_wanted & CEPH_CAP_ANY_FILE_WR) == 0) { - change_max = true; - new_max = 0; - } - else if ((all_wanted & CEPH_CAP_ANY_FILE_WR) && - (size << 1) >= latest->max_size) { - dout(10) << "wr caps wanted, and size " << size - << " *2 >= max " << latest->max_size << ", increasing" << dendl; - change_max = true; - new_max = latest->max_size ? 
(latest->max_size << 1):in->get_layout_size_increment(); - } - if ((all_wanted & CEPH_CAP_ANY_FILE_WR) && - m->get_max_size() > new_max) { - dout(10) << "client requests file_max " << m->get_max_size() - << " > max " << latest->max_size << dendl; - change_max = true; - new_max = (m->get_max_size() << 1) & ~(in->get_layout_size_increment() - 1); - } - - if (change_max && !in->filelock.can_wrlock()) { - dout(10) << "want to change file_max, but lock won't allow it; will retry" << dendl; - check_inode_max_size(in); // this will fail, and schedule a waiter. - change_max = false; - } + + if (latest->max_size && (wanted & CEPH_CAP_ANY_FILE_WR) == 0) { + change_max = true; + new_max = 0; + } + else if ((wanted & CEPH_CAP_ANY_FILE_WR) && + (size << 1) >= latest->max_size) { + dout(10) << " wr caps wanted, and size " << size + << " *2 >= max " << latest->max_size << ", increasing" << dendl; + change_max = true; + new_max = latest->max_size ? (latest->max_size << 1):in->get_layout_size_increment(); + } + if ((wanted & CEPH_CAP_ANY_FILE_WR) && + m->get_max_size() > new_max) { + dout(10) << "client requests file_max " << m->get_max_size() + << " > max " << latest->max_size << dendl; + change_max = true; + new_max = (m->get_max_size() << 1) & ~(in->get_layout_size_increment() - 1); + } + if (change_max && + !in->filelock.can_wrlock() && + (dirty & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_EXCL)) == 0) { // not already writing dirty file data + dout(10) << " i want to change file_max, but lock won't allow it; will retry" << dendl; + check_inode_max_size(in); // this will fail, and schedule a waiter. + change_max = false; } - if (!(dirty || change_max) || - in->is_base()) { // FIXME.. what about root inode mtime/atime? + if (!dirty && !change_max) return false; - } + + // do the update. 
EUpdate *le = new EUpdate(mds->mdlog, "size|max_size|mtime|ctime|atime update"); - inode_t *pi = in->project_inode(); pi->version = in->pre_dirty(); + + Mutation *mut = new Mutation; + mut->ls = mds->mdlog->get_current_segment(); + if (change_max) { dout(7) << " max_size " << pi->max_size << " -> " << new_max << dendl; pi->max_size = new_max; - } - if (dirty_mtime) { - dout(7) << " mtime " << pi->mtime << " -> " << mtime - << " for " << *in << dendl; - pi->mtime = mtime; - } - if (dirty_ctime) { - dout(7) << " ctime " << pi->ctime << " -> " << ctime - << " for " << *in << dendl; - pi->ctime = ctime; } - if (dirty_size) { - dout(7) << " size " << pi->size << " -> " << size - << " for " << *in << dendl; - pi->size = size; - pi->rstat.rbytes = size; - } - if (dirty_atime) { - dout(7) << " atime " << pi->atime << " -> " << atime - << " for " << *in << dendl; - pi->atime = atime; - } - if (file_excl && pi->time_warp_seq < m->get_time_warp_seq()) { - dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq() - << " for " << *in << dendl; - pi->time_warp_seq = m->get_time_warp_seq(); - } - if (dirty_owner) { - dout(7) << " uid.gid " << pi->uid << "." << pi->gid - << " -> " << m->head.uid << "." 
<< m->head.gid - << " for " << *in << dendl; - pi->uid = m->head.uid; - pi->gid = m->head.gid; + + // file + if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { + utime_t atime = m->get_atime(); + utime_t mtime = m->get_mtime(); + utime_t ctime = m->get_ctime(); + uint64_t size = m->get_size(); + + if (((dirty & CEPH_CAP_FILE_WR) && mtime > latest->mtime) || + ((dirty & CEPH_CAP_FILE_EXCL) && mtime != latest->mtime)) { + dout(7) << " mtime " << pi->mtime << " -> " << mtime + << " for " << *in << dendl; + pi->mtime = mtime; + } + if (ctime > latest->ctime) { + dout(7) << " ctime " << pi->ctime << " -> " << ctime + << " for " << *in << dendl; + pi->ctime = ctime; + } + if (size > latest->size) { + dout(7) << " size " << pi->size << " -> " << size + << " for " << *in << dendl; + pi->size = size; + pi->rstat.rbytes = size; + } + if ((dirty & CEPH_CAP_FILE_EXCL) && atime != latest->atime) { + dout(7) << " atime " << pi->atime << " -> " << atime + << " for " << *in << dendl; + pi->atime = atime; + } + if ((dirty & CEPH_CAP_FILE_EXCL) && pi->time_warp_seq < m->get_time_warp_seq()) { + dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq() + << " for " << *in << dendl; + pi->time_warp_seq = m->get_time_warp_seq(); + } } - if (dirty_mode) { - dout(7) << " mode " << oct << pi->mode - << " -> " << m->head.mode << dec + if (change_max) { + dout(7) << " max_size " << pi->max_size << " -> " << new_max << " for " << *in << dendl; - pi->mode = m->head.mode; + pi->max_size = new_max; } - - Mutation *mut = new Mutation; - mut->ls = mds->mdlog->get_current_segment(); - if (dirty_file) + + if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) file_wrlock_force(&in->filelock, mut); // wrlock for duration of journal - if (dirty_auth) + + // auth + if (dirty & CEPH_CAP_AUTH_EXCL) { + if (m->head.uid != latest->uid) { + dout(7) << " uid " << pi->uid + << " -> " << m->head.uid + << " for " << *in << dendl; + pi->uid = m->head.uid; + } + if 
(m->head.gid != latest->gid) { + dout(7) << " gid " << pi->gid + << " -> " << m->head.gid + << " for " << *in << dendl; + pi->gid = m->head.gid; + } + if (m->head.mode != latest->mode) { + dout(7) << " mode " << oct << pi->mode + << " -> " << m->head.mode << dec + << " for " << *in << dendl; + pi->mode = m->head.mode; + } + simple_wrlock_force(&in->authlock, mut); + } + mut->auth_pin(in); mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows); mdcache->journal_dirty_inode(mut, &le->metablob, in, follows); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index da09849566dd4..843e8461c238c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -3645,12 +3645,11 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap) if (realm->have_past_parents_open()) { dout(10) << "do_cap_import " << session->inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl; MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT, - in->inode, - realm->inode->ino(), - cap->get_last_seq(), - cap->pending(), - cap->wanted(), - cap->get_mseq()); + in->inode, + realm->inode->ino(), + cap->get_last_seq(), + cap->pending(), cap->wanted(), 0, + cap->get_mseq()); realm->build_snap_trace(reap->snapbl); mds->send_message_client(reap, session->inst); } else { diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 14c74418d8660..16c4f461c092c 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -904,12 +904,11 @@ void Migrator::finish_export_inode_caps(CInode *in) dout(7) << "finish_export_inode telling client" << it->first << " exported caps on " << *in << dendl; MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, - in->inode, - in->find_snaprealm()->inode->ino(), - cap->get_last_seq(), - cap->pending(), - cap->wanted(), - cap->get_mseq()); + in->inode, + in->find_snaprealm()->inode->ino(), + cap->get_last_seq(), + cap->pending(), cap->wanted(), 0, + cap->get_mseq()); mds->send_message_client(m, it->first); } 
in->clear_client_caps(); diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h index d5effd89c3599..2cf789a986121 100644 --- a/src/messages/MClientCaps.h +++ b/src/messages/MClientCaps.h @@ -26,6 +26,7 @@ class MClientCaps : public Message { int get_caps() { return head.caps; } int get_wanted() { return head.wanted; } + int get_dirty() { return head.dirty; } capseq_t get_seq() { return head.seq; } capseq_t get_mseq() { return head.migrate_seq; } @@ -67,6 +68,7 @@ class MClientCaps : public Message { long seq, int caps, int wanted, + int dirty, int mseq) : Message(CEPH_MSG_CLIENT_CAPS) { memset(&head, 0, sizeof(head)); @@ -76,6 +78,7 @@ class MClientCaps : public Message { head.seq = seq; head.caps = caps; head.wanted = wanted; + head.dirty = dirty; head.migrate_seq = mseq; head.uid = inode.uid; @@ -112,16 +115,19 @@ class MClientCaps : public Message { << " ino " << inodeno_t(head.ino) << " seq " << head.seq << " caps=" << ccap_string(head.caps) - << " wanted=" << ccap_string(head.wanted) - << " size " << head.size << "/" << head.max_size; + << " dirty=" << ccap_string(head.dirty) + << " wanted=" << ccap_string(head.wanted); + out << " follows " << snapid_t(head.snap_follows); + if (head.migrate_seq) + out << " mseq " << head.migrate_seq; + + out << " size " << head.size << "/" << head.max_size; if (head.truncate_seq) out << " ts " << head.truncate_seq; out << " mtime " << utime_t(head.mtime); if (head.time_warp_seq) out << " tws " << head.time_warp_seq; - out << " follows " << snapid_t(head.snap_follows); - if (head.migrate_seq) - out << " mseq " << head.migrate_seq; + out << ")"; } -- 2.39.5