caps
-- notes
- - client should not get unsolicited MClientCaps if wanted==0.
- - if we do un-acked cap release, we need to handle unsolicited import/export
- - may unacked release _only_ if wanted==0?
-- maybe we really want a 'dirty' mask
- - client knows when it has dirty data to writeback, and thus when it must wait for it to flush?
+
+- two types of cap events from mds -> client
+ - cap issue .. in a reply, or an IMPORT cap op.
+ - cap update (revocation, etc.) .. an MClientCaps message.
+- if client has cap, mds should too.
+- if client has no dirty data, it can release without waiting for an ack.
+ - client may thus get a cap _update_ and not have the cap. ignore it.
+ - mds should track seq of last _issue_ (not update). any release attempt will only succeed if the client has seen the latest issue.
+ - if client gets an IMPORT issue and doesn't have the inode, immediately send a release.
+
+- a FLUSH writes dirty metadata. it also usually releases wr/excl caps.
+ - 'dirty' is which metadata is to be written.
+ - 'caps' is which caps are being retained by the client.
+? - if 0, we can discard the cap.
+ - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written.
+- a FLUSH_ACK acks a FLUSH.
+ - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back)
+ - 'seq' is the _original_ FLUSH's seq.
+ - 'caps' is the _original_ FLUSH's caps.
+- a FLUSHSNAP flushes snapshot metadata.
+ - 'dirty' indicates which caps were dirty, if any.
+ - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK.
+- a RELEASE releases one or more (clean) caps.
+ - 'caps' is which caps are retained by the client.
+ - 'wanted' is which caps the client wants.
+ - dirty==0
+ - if caps==0, mds can close out the cap (provided there are no racing cap issues)
+- a WANTED simply updates the client's 'wanted' mask.
+ - 'wanted' is which caps the client wants.
+ - other fields should be ignored?
- kclient
- - only pin caps with wanted != 0?
- - put unwanted caps on an lru list; expire
+ - only pin caps with dirty metadata? and/or wanted != 0?
+ - and/or, put unwanted caps on an lru list, and expire?
- mds
- - segregate wanted/unwanted caps?
-
+ - segregate wanted/unwanted caps? rd/wr caps?
kernel client
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0) {
+#warning xattrs
in->xattrs.swap(st->xattrs);
}
if (in->snapdir_parent)
put_inode(in->snapdir_parent);
inode_map.erase(in->vino());
- in->cap_delay_item.remove_myself();
+ in->cap_item.remove_myself();
in->snaprealm_item.remove_myself();
if (in == root) root = 0;
delete in;
in->hold_caps_until = g_clock.now();
in->hold_caps_until += 5.0;
- delayed_caps.push_back(&in->cap_delay_item);
+ delayed_caps.push_back(&in->cap_item);
}
void Client::check_caps(Inode *in, bool is_delayed)
{
unsigned wanted = in->caps_wanted();
unsigned used = in->caps_used();
+ int like = wanted;
+ if (!unmounting)
+ like |= CEPH_CAP_ANY_RD;
dout(10) << "check_caps on " << *in
<< " wanted " << ccap_string(wanted)
InodeCap *cap = it->second;
it++;
- int like = wanted;
- if (!unmounting)
- like |= CEPH_CAP_ANY_RD;
-
int revoking = cap->implemented & ~cap->issued;
+ dout(10) << " cap mds" << mds
+ << " issued " << ccap_string(cap->issued)
+ << " implemented " << ccap_string(cap->implemented)
+ << " revoking " << ccap_string(revoking) << dendl;
+
if (in->wanted_max_size > in->inode.max_size &&
in->wanted_max_size > in->requested_max_size)
goto ack;
/* completed revocation? */
- if (revoking && (revoking && used) == 0) {
- dout(10) << "completed revocation of " << (cap->implemented & ~cap->issued) << dendl;
+ /* the revoked bits must be masked against 'used' with bitwise &.
+  * the old logical && reduced this test to "used == 0", so a revocation
+  * was never acked while any unrelated cap bit was still in use. */
+ if (revoking && (revoking & used) == 0) {
+ dout(10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
goto ack;
}
+ if (!revoking && unmounting)
+ goto ack;
/* approaching file_max? */
if ((cap->issued & CEPH_CAP_FILE_WR) &&
}
ack:
- int op;
- if (wanted == 0)
+ int op = CEPH_CAP_OP_UPDATE;
+ if (like == 0)
op = CEPH_CAP_OP_RELEASE;
else
- op = CEPH_CAP_OP_ACK;
+ op = CEPH_CAP_OP_UPDATE;
+
+ cap->flushing |= in->caps_dirty() & cap->issued;
+ if (cap->flushing) {
+ dout(10) << " flushing " << ccap_string(cap->flushing) << dendl;
+ in->dirty_caps &= ~cap->flushing;
+ }
+ cap->issued &= like;
+
MClientCaps *m = new MClientCaps(op,
in->inode,
0,
cap->seq,
cap->issued,
wanted,
+ cap->flushing,
cap->mseq);
- cap->wanted = wanted;
in->reported_size = in->inode.size;
m->set_max_size(in->wanted_max_size);
in->requested_max_size = in->wanted_max_size;
m->set_snap_follows(in->snaprealm->get_snap_context().seq);
messenger->send_message(m, mdsmap->get_inst(mds));
+
+ if (cap->flushing == 0 && cap->issued == 0)
+ remove_cap(in, mds);
}
}
in->exporting_mseq = 0;
}
in->caps[mds] = cap = new InodeCap;
+ cap_list.push_back(&in->cap_item);
}
unsigned old_caps = cap->issued;
cap->seq = seq;
cap->mseq = mseq;
dout(10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
- << " from mds" << mds << dendl;
+ << " from mds" << mds
+ << " on " << *in
+ << dendl;
if (issued & ~old_caps)
signal_cond_list(in->waitfor_caps);
void Client::remove_cap(Inode *in, int mds)
{
+ dout(10) << "remove_cap mds" << mds << " on " << *in << dendl;
assert(in->caps.count(mds));
in->caps.erase(mds);
if (in->caps.empty()) {
if (inode_map.count(vino)) in = inode_map[vino];
if (!in) {
dout(5) << "handle_caps don't have vino " << vino << dendl;
- assert(0); // shouldn't happen
- delete m;
+
+ if (m->get_op() == CEPH_CAP_OP_IMPORT) {
+ // release.
+ m->set_op(CEPH_CAP_OP_RELEASE);
+ m->head.caps = 0;
+ m->head.dirty = 0;
+ messenger->send_message(m, m->get_source_inst());
+ } else
+ delete m;
return;
}
case CEPH_CAP_OP_EXPORT: return handle_cap_export(in, m);
case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(in, m);
case CEPH_CAP_OP_GRANT: return handle_cap_grant(in, m);
- case CEPH_CAP_OP_RELEASED: return handle_cap_released(in, m);
- case CEPH_CAP_OP_FLUSHEDSNAP: return handle_cap_flushedsnap(in, m);
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(in, m);
+ case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(in, m);
default:
delete m;
}
delete m;
}
-void Client::handle_cap_released(Inode *in, MClientCaps *m)
+/*
+ * handle a FLUSH_ACK from an mds.
+ *
+ * the ack echoes the 'dirty' and 'caps' of the message being acked, so the
+ * bits that were flushed and not retained (dirty & ~caps) are now clean and
+ * are cleared from the cap's flushing mask.  if that message retained no
+ * caps and its seq matches the cap's current seq, the cap is removed.
+ * consumes (deletes) m.
+ */
+void Client::handle_cap_flush_ack(Inode *in, MClientCaps *m)
{
int mds = m->get_source().num();
- assert(in->caps[mds]);
+ // check with count(): operator[] would insert a null entry for an mds we
+ // hold no cap from (same check as remove_cap).
+ assert(in->caps.count(mds));
+ InodeCap *cap = in->caps[mds];
- dout(5) << "handle_cap_released mds" << mds << " released cap on " << *in << dendl;
- remove_cap(in, mds);
+ int cleaned = m->get_dirty() & ~m->get_caps();
+ dout(5) << "handle_cap_flush_ack mds" << mds
+ << " cleaned " << ccap_string(cleaned) << " on " << *in << dendl;
+ cap->flushing &= ~cleaned;
+ dout(5) << " cap->flushing now " << ccap_string(cap->flushing)
+ << ", in->caps_dirty() now " << ccap_string(in->caps_dirty()) << dendl;
+
+ // nothing retained and no newer seq seen since: close out the cap.
+ if (m->get_caps() == 0 &&
+ m->get_seq() == cap->seq) {
+ assert(in->caps_dirty() == 0);
+ remove_cap(in, mds);
+ }
delete m;
}
-void Client::handle_cap_flushedsnap(Inode *in, MClientCaps *m)
+void Client::handle_cap_flushsnap_ack(Inode *in, MClientCaps *m)
{
int mds = m->get_source().num();
assert(in->caps[mds]);
} else {
dout(5) << "handle_cap_flushedsnap DUP(?) mds" << mds << " flushed snap follows " << follows
<< " on " << *in << dendl;
- // we may not have it if we send multiple FLUSHSNAP requests and (get multiple FLUSHEDSNAPs back)
+ // we may not have it if we sent multiple FLUSHSNAP requests (and got multiple FLUSHSNAP_ACKs back)
}
delete m;
void Client::handle_cap_grant(Inode *in, MClientCaps *m)
{
int mds = m->get_source().num();
- assert(in->caps[mds]);
+ if (in->caps.count(mds) == 0) {
+ dout(5) << "handle_cap_grant on ino " << m->get_ino() << " no cap for mds" << mds << dendl;
+ delete m;
+ return;
+ }
InodeCap *cap = in->caps[mds];
-
cap->seq = m->get_seq();
in->inode.layout = m->get_layout();
// don't want it?
- int wanted = in->caps_wanted();
/*
+ int wanted = in->caps_wanted();
if (wanted == 0) {
dout(5) << "handle_cap_grant on ino " << m->get_ino()
<< " seq " << m->get_seq()
<< " caps now " << ccap_string(new_caps)
<< " was " << ccap_string(old_caps) << dendl;
- // size/ctime/mtime/atime
+ // update inode
+ int issued = in->caps_issued();
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ in->inode.mode = m->head.mode;
+ in->inode.uid = m->head.uid;
+ in->inode.gid = m->head.gid;
+ }
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+ in->inode.nlink = m->head.nlink;
+ in->inode.anchored = false; /* lie */
+ }
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0) {
+#warning xattrs in->xattrs.swap(m->xattrs);
+ }
update_inode_file_bits(in, m->get_truncate_seq(), m->get_size(),
m->get_time_warp_seq(), m->get_ctime(), m->get_mtime(), m->get_atime(), old_caps);
}
// update caps
-
- bool ack = false;
-
if (old_caps & ~new_caps) {
dout(10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl;
cap->issued = new_caps;
if ((used & ~new_caps) & CEPH_CAP_FILE_WRBUFFER)
_flush(in);
- else {
- ack = true;
- cap->implemented = new_caps;
-
- // share our (possibly newer) file size, mtime, atime
- m->set_size(in->inode.size);
- m->set_max_size(0); // dont re-request
- m->set_mtime(in->inode.mtime);
- m->set_atime(in->inode.atime);
- m->set_wanted(wanted);
- cap->wanted = wanted;
- m->set_snap_follows(in->snaprealm->get_snap_context().seq);
- m->set_migrate_seq(cap->mseq);
- }
+ else
+ check_caps(in, false);
+
} else if (old_caps == new_caps) {
dout(10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
} else {
if (new_caps)
signal_cond_list(in->waitfor_caps);
- if (ack)
- messenger->send_message(m, m->get_source_inst());
- else
- delete m;
+ delete m;
}
// NOTE: i'm assuming all caches are already flushing (because all files are closed).
assert(fd_map.empty());
- dout(10) << "a" << dendl;
-
_ll_drop_pins();
-
- dout(10) << "b" << dendl;
// empty lru cache
lru.lru_set_max(0);
check_caps(in, true);
}
+ // other caps, too
+ p = cap_list.begin();
+ while (!p.end()) {
+ Inode *in = *p;
+ ++p;
+ check_caps(in, true);
+ }
+
if (g_conf.client_oc) {
// release any/all caps
hash_map<vinodeno_t, Inode*>::iterator next;
if (in->hold_caps_until > now)
break;
delayed_caps.pop_front();
+ cap_list.push_back(&in->cap_item);
check_caps(in, true);
}
}
dn->inode->inode.mtime = mtime;
if (mask & CEPH_UTIME_ATIME)
dn->inode->inode.atime = atime;
+ dn->inode->dirty_caps |= CEPH_CAP_FILE_EXCL;
return 0;
}
// extend file?
if (totalwritten + offset > in->inode.size) {
in->inode.size = totalwritten + offset;
+ in->dirty_caps |= CEPH_CAP_FILE_WR;
if ((in->inode.size << 1) >= in->inode.max_size &&
(in->reported_size << 1) < in->inode.max_size)
// mtime
in->inode.mtime = g_clock.real_now();
+ in->dirty_caps |= CEPH_CAP_FILE_WR;
put_cap_ref(in, CEPH_CAP_FILE_WR);
unsigned issued;
unsigned implemented;
unsigned wanted; // as known to mds.
+ unsigned flushing;
__u64 seq;
__u32 mseq; // migration seq
- InodeCap() : issued(0), implemented(0), wanted(0), seq(0), mseq(0) {}
+ InodeCap() : issued(0), implemented(0), wanted(0), flushing(0), seq(0), mseq(0) {}
};
struct CapSnap {
// per-mds caps
map<int,InodeCap*> caps; // mds -> InodeCap
+ unsigned dirty_caps;
int snap_caps, snap_cap_refs;
unsigned exporting_issued;
int exporting_mds;
capseq_t exporting_mseq;
utime_t hold_caps_until;
- xlist<Inode*>::item cap_delay_item;
+ xlist<Inode*>::item cap_item;
SnapRealm *snaprealm;
xlist<Inode*>::item snaprealm_item;
snapid(vino.snapid),
lease_mask(0), lease_mds(-1),
dir_auth(-1), dir_hashed(false), dir_replicated(false),
+ dirty_caps(0),
snap_caps(0), snap_cap_refs(0),
exporting_issued(0), exporting_mds(-1), exporting_mseq(0),
- cap_delay_item(this),
+ cap_item(this),
snaprealm(0), snaprealm_item(this), snapdir_parent(0),
reported_size(0), wanted_max_size(0), requested_max_size(0),
ref(0), ll_ref(0),
want |= CEPH_CAP_FILE_EXCL;
return want;
}
+ // union of all dirty cap bits for this inode: the inode-wide dirty_caps
+ // mask plus whatever each per-mds cap currently has in its flushing mask.
+ int caps_dirty() {
+   int mask = dirty_caps;
+   map<int,InodeCap*>::iterator p = caps.begin();
+   while (p != caps.end()) {
+     mask |= p->second->flushing;
+     p++;
+   }
+   return mask;
+ }
bool have_valid_size() {
// RD+RDCACHE or WR+WRBUFFER => valid size
Inode* root;
LRU lru; // lru list of Dentry's in our local metadata cache.
- xlist<Inode*> delayed_caps;
+ // all inodes with caps sit on either cap_list or delayed_caps.
+ xlist<Inode*> delayed_caps, cap_list;
hash_map<inodeno_t,SnapRealm*> snap_realms;
SnapRealm *get_snap_realm(inodeno_t r) {
void handle_cap_import(Inode *in, class MClientCaps *m);
void handle_cap_export(Inode *in, class MClientCaps *m);
void handle_cap_trunc(Inode *in, class MClientCaps *m);
- void handle_cap_released(Inode *in, class MClientCaps *m);
- void handle_cap_flushedsnap(Inode *in, class MClientCaps *m);
+ void handle_cap_flush_ack(Inode *in, class MClientCaps *m);
+ void handle_cap_flushsnap_ack(Inode *in, class MClientCaps *m);
void handle_cap_grant(Inode *in, class MClientCaps *m);
void cap_delay_requeue(Inode *in);
void check_caps(Inode *in, bool is_delayed);
CEPH_CAP_FILE_RDCACHE | \
CEPH_CAP_XATTR_RDCACHE)
-#define CEPH_CAP_ANY_RD (CEPH_CAP_AUTH_RDCACHE | \
- CEPH_CAP_LINK_RDCACHE | \
- CEPH_CAP_XATTR_RDCACHE | \
- CEPH_CAP_FILE_RDCACHE | CEPH_CAP_FILE_RD)
+#define CEPH_CAP_ANY_RDCACHE (CEPH_CAP_AUTH_RDCACHE | \
+ CEPH_CAP_LINK_RDCACHE | \
+ CEPH_CAP_XATTR_RDCACHE | \
+ CEPH_CAP_FILE_RDCACHE)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_RDCACHE | CEPH_CAP_FILE_RD)
#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
CEPH_CAP_LINK_EXCL | \
CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
CEPH_CAP_OP_IMPORT, /* mds has imported the cap from specified mds */
- CEPH_CAP_OP_RELEASED, /* mds->client close out cap */
- CEPH_CAP_OP_FLUSHEDSNAP, /* mds->client flushed snap */
- CEPH_CAP_OP_ACK, /* client->mds ack (if prior grant was recall) */
- CEPH_CAP_OP_REQUEST, /* client->mds request (update wanted bits) */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed. if caps=0, cap also released. */
CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
- CEPH_CAP_OP_RELEASE, /* client->mds request release cap */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
};
static inline const char *ceph_cap_op_name(int op)
case CEPH_CAP_OP_TRUNC: return "trunc";
case CEPH_CAP_OP_EXPORT: return "export";
case CEPH_CAP_OP_IMPORT: return "import";
- case CEPH_CAP_OP_RELEASED: return "released";
- case CEPH_CAP_OP_FLUSHEDSNAP: return "flushedsnap";
- case CEPH_CAP_OP_ACK: return "ack";
- case CEPH_CAP_OP_REQUEST: return "request";
+ case CEPH_CAP_OP_UPDATE: return "update";
+ case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
case CEPH_CAP_OP_RELEASE: return "release";
default: return "???";
}
__le32 op;
__le64 ino, realm;
__le32 seq;
- __le32 caps, wanted;
+ __le32 caps, wanted, dirty;
__le32 migrate_seq;
__le64 snap_follows;
__le32 snap_trace_len;
}
/* completed revocation? going down and there are no caps? */
- if ((revoking && (revoking & used) == 0) ||
- (mdsc->stopping && (used == 0))) {
+ if (revoking && (revoking & used) == 0) {
dout(10, "completed revocation of %d\n",
cap->implemented & ~cap->issued);
goto ack;
}
+ /* shutting down? */
+ if (!revoking && mdsc->stopping && (used == 0))
+ goto ack;
+
if ((cap->issued & ~(file_wanted | used)) == 0)
continue; /* nothing extra, all good */
}
if (cap && valid) {
bool loner = (get_loner() == client);
- int issue = (cap->wanted() | CEPH_CAP_ANY_RD) & get_caps_allowed(loner);
+ int likes = get_caps_liked();
+ int issue = (cap->wanted() | likes) & get_caps_allowed(loner);
int pending = cap->pending();
if (issue & ~pending) {
dout(10) << "encode_inodestat issuing " << ccap_string(issue)
}
// caps allowed
+ // caps a client is assumed to like in addition to what it asks for:
+ // always the pin, plus the cache-only read caps for a directory or the
+ // full read set for anything else.
+ int get_caps_liked() {
+   int liked = CEPH_CAP_PIN;
+   if (is_dir())
+     liked |= CEPH_CAP_ANY_RDCACHE;
+   else
+     liked |= CEPH_CAP_ANY_RD;
+   return liked;
+ }
int get_caps_allowed_ever() {
return
CEPH_CAP_PIN |
#include "config.h"
+
+/*
+
+ Capability protocol notes.
+
+- two types of cap events from mds -> client:
+ - cap "issue" in a MClientReply, or an MClientCaps IMPORT op.
+ - cap "update" (revocation, etc.) .. an MClientCaps message.
+- if client has cap, the mds should have it too.
+
+- if client has no dirty data, it can release it without waiting for an mds ack.
+ - client may thus get a cap _update_ and not have the cap. ignore it.
+
+- mds should track seq of last _issue_ (not update). any release
+ attempt will only succeed if the client has seen the latest issue.
+ - if client gets an IMPORT issue and doesn't have the inode, immediately send a release.
+
+- an UPDATE updates the client's issued caps, wanted, etc. it may also flush dirty metadata.
+ - 'caps' are which caps the client retains.
+ - if 0, client wishes to release the cap
+ - 'wanted' is which caps the client wants.
+ - 'dirty' is which metadata is to be written.
+ - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written.
+
+- a FLUSH_ACK acks a FLUSH, i.e. a client cap message that was sent with nonzero 'dirty'.
+ - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back)
+ - 'seq' is the _original_ FLUSH's seq.
+ - 'caps' is the _original_ FLUSH's caps.
+ - client can conclude that (dirty & ~caps) bits were successfully cleaned.
+
+- a FLUSHSNAP flushes snapshot metadata.
+ - 'dirty' indicates which caps were dirty, if any.
+ - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK.
+
+- a RELEASE releases one or more (clean) caps.
+ - 'caps' is which caps are retained by the client.
+ - 'wanted' is which caps the client wants.
+ - dirty==0
+ - if caps==0, mds can close out the cap (provided there are no racing cap issues)
+
+ */
+
class CInode;
class Capability {
map<capseq_t, __u32> cap_history; // seq -> cap, [last_recv,last_sent]
capseq_t last_sent, last_recv;
- capseq_t last_open;
+ capseq_t last_issue;
capseq_t mseq;
int suppress;
wanted_caps(want),
last_sent(s),
last_recv(s),
- last_open(0),
+ last_issue(0),
mseq(0),
suppress(0), stale(false), releasing(0),
client_follows(0),
capseq_t get_mseq() { return mseq; }
capseq_t get_last_sent() { return last_sent; }
- capseq_t get_last_open() { return last_open; }
- void set_last_open() { last_open = last_sent; }
+
+ capseq_t get_last_issue() { return last_issue; }
bool is_suppress() { return suppress > 0; }
void inc_suppress() { suppress++; }
// issue caps; return seq number.
+ // each call bumps last_sent, records the issued caps in cap_history under
+ // the new seq, and remembers that seq as last_issue.
capseq_t issue(int c) {
- ++last_sent;
+ last_issue = ++last_sent;
cap_history[last_sent] = c;
return last_sent;
}
// re-issue whatever we can
//cap->issue(cap->pending());
- //cap->set_last_open(); // not used, atm.
return cap;
}
nissued++;
// include caps that clients generally like, while we're at it.
- int likes = CEPH_CAP_ANY_RD;
-
+ int likes = in->get_caps_liked();
int before = pending;
long seq = cap->issue((wanted|likes) & allowed);
int after = cap->pending();
in->inode,
in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
- after,
- wanted,
+ after, wanted, 0,
cap->get_mseq()),
it->first);
}
in->inode,
in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
- cap->pending(),
- cap->wanted(),
+ cap->pending(), cap->wanted(), 0,
cap->get_mseq()),
it->first);
}
if (cap->pending() & (CEPH_CAP_GWR<<CEPH_CAP_SFILE)) {
dout(10) << "share_inode_max_size with client" << client << dendl;
mds->send_message_client(new MClientCaps(CEPH_CAP_OP_GRANT,
- in->inode,
- in->find_snaprealm()->inode->ino(),
- cap->get_last_seq(),
- cap->pending(),
- cap->wanted(),
- cap->get_mseq()),
+ in->inode,
+ in->find_snaprealm()->inode->ino(),
+ cap->get_last_seq(),
+ cap->pending(), cap->wanted(), 0,
+ cap->get_mseq()),
client);
}
}
return;
}
- // flushsnap?
- if (m->get_op() == CEPH_CAP_OP_FLUSHSNAP) {
- dout(7) << " flushsnap follows " << follows
- << " client" << client << " on " << *in << dendl;
- // this cap now follows a later snap (i.e. the one initiating this flush, or later)
- cap->client_follows = follows+1;
-
- if (in->last <= follows) {
- dout(10) << " flushsnap releasing cloned cap" << dendl;
- in->remove_client_cap(client);
- } else {
- dout(10) << " flushsnap NOT releasing live cap" << dendl;
- }
-
- // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
- // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst
- // case we get a dup response, so whatever.)
- MClientCaps *ack = new MClientCaps(CEPH_CAP_OP_FLUSHEDSNAP, in->inode, 0, 0, 0, 0, 0);
- ack->set_snap_follows(follows);
- if (!_do_cap_update(in, m->get_caps(), in->get_caps_wanted(), follows, m, ack)) {
- mds->send_message_client(ack, client);
- eval_cap_gather(in);
- }
- } else {
+ int op = m->get_op();
- // for this and all subsequent versions of this inode,
- while (1) {
- // filter wanted based on what we could ever give out (given auth/replica status)
- int wanted = m->get_wanted() & head_in->get_caps_allowed_ever();
- int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
- int has = cap->confirmed();
- dout(10) << " follows " << follows
- << ", had " << ccap_string(had)
- << ", has " << ccap_string(has)
- << " on " << *in << dendl;
+ // flushsnap?
+ if (op == CEPH_CAP_OP_FLUSHSNAP) {
+ if (in->is_auth()) {
+ dout(7) << " flushsnap follows " << follows
+ << " client" << client << " on " << *in << dendl;
+ // this cap now follows a later snap (i.e. the one initiating this flush, or later)
+ cap->client_follows = follows+1;
+
+ if (in->last <= follows) {
+ dout(10) << " flushsnap releasing cloned cap" << dendl;
+ in->remove_client_cap(client);
+ } else {
+ dout(10) << " flushsnap NOT releasing live cap" << dendl;
+ }
+ // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
+ // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst
+ // case we get a dup response, so whatever.)
MClientCaps *ack = 0;
- capseq_t releasecap = 0;
-
- if (m->get_seq() < cap->get_last_sent()) {
- /* client may be trying to release caps (i.e. inode closed, etc.)
- * by setting reducing wanted set. but it may also be opening the
- * same filename, not sure that it'll map to the same inode. so,
- * we don't want RELEASE or wanted updates to clobber mds's notion
- * of wanted unless we're sure the client has seen all the latest
- * caps.
- *
- * we use last_sent here, not last_open, just to keep the client
- * logic for deciding when to reply to a revocation simple.
- */
- dout(10) << " ignoring release|wanted " << ccap_string(m->get_wanted())
- << " bc seq " << m->get_seq() << " < last sent " << cap->get_last_sent() << dendl;
- } else if (m->get_op() == CEPH_CAP_OP_RELEASE) {
- dout(7) << " release request client" << client << " seq " << m->get_seq() << " on " << *in << dendl;
- releasecap = m->get_seq();
- /*
- * if multiple release requests overlap (i.e. because the first one is waiting
- * for the log to flush), wait for them all to "complete", and only ack the
- * last one. to do this, keep count; see matching decrement in _finish_release_cap().
- */
- cap->releasing++;
- ack = new MClientCaps(CEPH_CAP_OP_RELEASED, in->inode, 0, 0, 0, 0, 0);
- } else if (wanted != cap->wanted()) {
- dout(10) << " wanted " << ccap_string(cap->wanted())
- << " -> " << ccap_string(wanted) << dendl;
- cap->set_wanted(wanted);
+ if (m->get_dirty()) {
+ ack = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP_ACK, in->inode, 0, 0, 0, 0, m->get_dirty(), 0);
+ ack->set_snap_follows(follows);
}
-
- if (((has|had) & CEPH_CAP_ANY_WR) == 0 || // didn't have any wr caps,
- !_do_cap_update(in, has|had, in->get_caps_wanted() | wanted, // or didn't change anything
- follows, m, ack, releasecap)) {
- // no update, ack now.
- if (releasecap) {
- assert(ack);
- _finish_release_cap(in, client, releasecap, ack);
- } else if (ack)
+ if (!_do_cap_update(in, m->get_dirty(), 0, follows, m, ack)) {
+ if (ack)
mds->send_message_client(ack, client);
-
eval_cap_gather(in);
}
+ } else
+ dout(7) << " not auth, ignoring flushsnap on " << *in << dendl;
+ goto out;
+ }
+
+
+ // for this and all subsequent versions of this inode,
+ while (1) {
+ // filter wanted based on what we could ever give out (given auth/replica status)
+ int wanted = m->get_wanted() & head_in->get_caps_allowed_ever();
+ int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
+ int has = cap->confirmed();
+ dout(10) << " follows " << follows
+ << ", had " << ccap_string(had)
+ << ", has " << ccap_string(has)
+ << " on " << *in << dendl;
+
+ MClientCaps *ack = 0;
+ capseq_t releasecap = 0;
+
+ if (m->get_dirty() && in->is_auth()) {
+ dout(7) << " flush client" << client << " dirty " << ccap_string(m->get_dirty())
+ << " seq " << m->get_seq() << " on " << *in << dendl;
+ ack = new MClientCaps(CEPH_CAP_OP_FLUSH_ACK, in->inode, 0, m->get_seq(),
+ m->get_caps(), 0, m->get_dirty(), 0);
+ }
+ if (m->get_caps() == 0) {
+ // the client retains nothing.  only close out the cap if the client has
+ // seen our latest issue; an older seq means a newer issue may still be
+ // in flight to the client, so the release must not take effect.
+ if (m->get_seq() >= cap->get_last_issue()) {
+ dout(7) << " releasing request client" << client << " seq " << m->get_seq() << " on " << *in << dendl;
+ cap->releasing++;
+ releasecap = m->get_seq();
+ } else {
+ dout(7) << " NOT releasing request client" << client << " seq " << m->get_seq()
+ << " (issue race) on " << *in << dendl;
+ }
+ }
+ if (wanted != cap->wanted()) {
+ dout(10) << " wanted " << ccap_string(cap->wanted())
+ << " -> " << ccap_string(wanted) << dendl;
+ cap->set_wanted(wanted);
+ }
+
+ if (!_do_cap_update(in, m->get_dirty(), m->get_wanted(), follows, m, ack, releasecap)) {
+ // no update, ack now.
+ if (releasecap)
+ _finish_release_cap(in, client, releasecap, ack);
+ else if (ack)
+ mds->send_message_client(ack, client);
- // done?
- if (in->last == CEPH_NOSNAP)
- break;
-
- // next!
- in = mdcache->pick_inode_snap(in, in->last);
- cap = in->get_client_cap(client);
- assert(cap);
+ eval_cap_gather(in);
+ if (in->filelock.is_stable())
+ file_eval(&in->filelock);
}
+
+ // done?
+ if (in->last == CEPH_NOSNAP)
+ break;
+
+ // next!
+ in = mdcache->pick_inode_snap(in, in->last);
+ cap = in->get_client_cap(client);
+ assert(cap);
}
-
+
+ out:
delete m;
}
if (!in->is_auth())
request_inode_file_caps(in);
- mds->send_message_client(ack, client);
+ if (ack)
+ mds->send_message_client(ack, client);
// unlinked stray? may need to purge (e.g., after all caps are released)
if (in->inode.nlink == 0 &&
}
}
-bool Locker::_do_cap_update(CInode *in, int had, int all_wanted, snapid_t follows, MClientCaps *m,
+/*
+ * update inode based on cap flush|flushsnap|wanted.
+ * adjust max_size, if needed.
+ * if we update, return true; otherwise, false (no update needed).
+ */
+bool Locker::_do_cap_update(CInode *in, int dirty, int wanted, snapid_t follows, MClientCaps *m,
MClientCaps *ack, capseq_t releasecap)
{
- dout(10) << "_do_cap_update had " << ccap_string(had) << " on " << *in << dendl;
-
+ dout(10) << "_do_cap_update dirty " << ccap_string(dirty)
+ << " wanted " << ccap_string(wanted)
+ << " on " << *in << dendl;
+ assert(in->is_auth());
int client = m->get_source().num();
-
inode_t *latest = in->get_projected_inode();
- utime_t atime = m->get_atime();
- utime_t mtime = m->get_mtime();
- utime_t ctime = m->get_ctime();
- uint64_t size = m->get_size();
-
- // atime|mtime|size?
- bool had_or_has_wr = had & (CEPH_CAP_GWR << CEPH_CAP_SFILE);
- bool file_excl = had & (CEPH_CAP_GEXCL << CEPH_CAP_SFILE);
- bool dirty_atime = false;
- bool dirty_mtime = false;
- bool dirty_ctime = false;
- bool dirty_size = false;
- bool dirty_file = false;
- if (had_or_has_wr || file_excl) {
- if (mtime > latest->mtime || (file_excl && mtime != latest->mtime))
- dirty_file = dirty_mtime = true;
- if (ctime > latest->ctime)
- dirty_file = dirty_ctime = true;
- if (size > latest->size)
- dirty_file = dirty_size = true;
- }
- if (file_excl && atime != latest->atime)
- dirty_file = dirty_atime = true;
- bool dirty_mode = false;
- bool dirty_owner = false;
- bool dirty_auth = false;
- if (had & (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)) {
- if (m->head.uid != latest->uid ||
- m->head.gid != latest->gid)
- dirty_auth = dirty_owner = true;
- if (m->head.mode != latest->mode)
- dirty_auth = dirty_mode = true;
- }
- bool dirty = dirty_file || dirty_auth;
-
+ if (in->is_base())
+ return false; // FIXME?
+
// increase or zero max_size?
+ __u64 size = m->get_size();
bool change_max = false;
uint64_t new_max = latest->max_size;
-
- if (in->is_auth()) {
- if (latest->max_size && (all_wanted & CEPH_CAP_ANY_FILE_WR) == 0) {
- change_max = true;
- new_max = 0;
- }
- else if ((all_wanted & CEPH_CAP_ANY_FILE_WR) &&
- (size << 1) >= latest->max_size) {
- dout(10) << "wr caps wanted, and size " << size
- << " *2 >= max " << latest->max_size << ", increasing" << dendl;
- change_max = true;
- new_max = latest->max_size ? (latest->max_size << 1):in->get_layout_size_increment();
- }
- if ((all_wanted & CEPH_CAP_ANY_FILE_WR) &&
- m->get_max_size() > new_max) {
- dout(10) << "client requests file_max " << m->get_max_size()
- << " > max " << latest->max_size << dendl;
- change_max = true;
- new_max = (m->get_max_size() << 1) & ~(in->get_layout_size_increment() - 1);
- }
-
- if (change_max && !in->filelock.can_wrlock()) {
- dout(10) << "want to change file_max, but lock won't allow it; will retry" << dendl;
- check_inode_max_size(in); // this will fail, and schedule a waiter.
- change_max = false;
- }
+
+ if (latest->max_size && (wanted & CEPH_CAP_ANY_FILE_WR) == 0) {
+ change_max = true;
+ new_max = 0;
+ }
+ else if ((wanted & CEPH_CAP_ANY_FILE_WR) &&
+ (size << 1) >= latest->max_size) {
+ dout(10) << " wr caps wanted, and size " << size
+ << " *2 >= max " << latest->max_size << ", increasing" << dendl;
+ change_max = true;
+ new_max = latest->max_size ? (latest->max_size << 1):in->get_layout_size_increment();
+ }
+ if ((wanted & CEPH_CAP_ANY_FILE_WR) &&
+ m->get_max_size() > new_max) {
+ dout(10) << "client requests file_max " << m->get_max_size()
+ << " > max " << latest->max_size << dendl;
+ change_max = true;
+ new_max = (m->get_max_size() << 1) & ~(in->get_layout_size_increment() - 1);
+ }
+ if (change_max &&
+ !in->filelock.can_wrlock() &&
+ (dirty & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_EXCL)) == 0) { // not already writing dirty file data
+ dout(10) << " i want to change file_max, but lock won't allow it; will retry" << dendl;
+ check_inode_max_size(in); // this will fail, and schedule a waiter.
+ change_max = false;
}
- if (!(dirty || change_max) ||
- in->is_base()) { // FIXME.. what about root inode mtime/atime?
+ if (!dirty && !change_max)
return false;
- }
+
+ // do the update.
EUpdate *le = new EUpdate(mds->mdlog, "size|max_size|mtime|ctime|atime update");
-
inode_t *pi = in->project_inode();
pi->version = in->pre_dirty();
+
+ Mutation *mut = new Mutation;
+ mut->ls = mds->mdlog->get_current_segment();
+
if (change_max) {
dout(7) << " max_size " << pi->max_size << " -> " << new_max << dendl;
pi->max_size = new_max;
- }
- if (dirty_mtime) {
- dout(7) << " mtime " << pi->mtime << " -> " << mtime
- << " for " << *in << dendl;
- pi->mtime = mtime;
- }
- if (dirty_ctime) {
- dout(7) << " ctime " << pi->ctime << " -> " << ctime
- << " for " << *in << dendl;
- pi->ctime = ctime;
}
- if (dirty_size) {
- dout(7) << " size " << pi->size << " -> " << size
- << " for " << *in << dendl;
- pi->size = size;
- pi->rstat.rbytes = size;
- }
- if (dirty_atime) {
- dout(7) << " atime " << pi->atime << " -> " << atime
- << " for " << *in << dendl;
- pi->atime = atime;
- }
- if (file_excl && pi->time_warp_seq < m->get_time_warp_seq()) {
- dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq()
- << " for " << *in << dendl;
- pi->time_warp_seq = m->get_time_warp_seq();
- }
- if (dirty_owner) {
- dout(7) << " uid.gid " << pi->uid << "." << pi->gid
- << " -> " << m->head.uid << "." << m->head.gid
- << " for " << *in << dendl;
- pi->uid = m->head.uid;
- pi->gid = m->head.gid;
+
+ // file
+ if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
+ utime_t atime = m->get_atime();
+ utime_t mtime = m->get_mtime();
+ utime_t ctime = m->get_ctime();
+ uint64_t size = m->get_size();
+
+ if (((dirty & CEPH_CAP_FILE_WR) && mtime > latest->mtime) ||
+ ((dirty & CEPH_CAP_FILE_EXCL) && mtime != latest->mtime)) {
+ dout(7) << " mtime " << pi->mtime << " -> " << mtime
+ << " for " << *in << dendl;
+ pi->mtime = mtime;
+ }
+ if (ctime > latest->ctime) {
+ dout(7) << " ctime " << pi->ctime << " -> " << ctime
+ << " for " << *in << dendl;
+ pi->ctime = ctime;
+ }
+ if (size > latest->size) {
+ dout(7) << " size " << pi->size << " -> " << size
+ << " for " << *in << dendl;
+ pi->size = size;
+ pi->rstat.rbytes = size;
+ }
+ if ((dirty & CEPH_CAP_FILE_EXCL) && atime != latest->atime) {
+ dout(7) << " atime " << pi->atime << " -> " << atime
+ << " for " << *in << dendl;
+ pi->atime = atime;
+ }
+ if ((dirty & CEPH_CAP_FILE_EXCL) && pi->time_warp_seq < m->get_time_warp_seq()) {
+ dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq()
+ << " for " << *in << dendl;
+ pi->time_warp_seq = m->get_time_warp_seq();
+ }
}
- if (dirty_mode) {
- dout(7) << " mode " << oct << pi->mode
- << " -> " << m->head.mode << dec
+ if (change_max) {
+ dout(7) << " max_size " << pi->max_size << " -> " << new_max
<< " for " << *in << dendl;
- pi->mode = m->head.mode;
+ pi->max_size = new_max;
}
-
- Mutation *mut = new Mutation;
- mut->ls = mds->mdlog->get_current_segment();
- if (dirty_file)
+
+ if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)))
file_wrlock_force(&in->filelock, mut); // wrlock for duration of journal
- if (dirty_auth)
+
+ // auth
+ if (dirty & CEPH_CAP_AUTH_EXCL) {
+ if (m->head.uid != latest->uid) {
+ dout(7) << " uid " << pi->uid
+ << " -> " << m->head.uid
+ << " for " << *in << dendl;
+ pi->uid = m->head.uid;
+ }
+ if (m->head.gid != latest->gid) {
+ dout(7) << " gid " << pi->gid
+ << " -> " << m->head.gid
+ << " for " << *in << dendl;
+ pi->gid = m->head.gid;
+ }
+ if (m->head.mode != latest->mode) {
+ dout(7) << " mode " << oct << pi->mode
+ << " -> " << m->head.mode << dec
+ << " for " << *in << dendl;
+ pi->mode = m->head.mode;
+ }
+
simple_wrlock_force(&in->authlock, mut);
+ }
+
mut->auth_pin(in);
mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
mdcache->journal_dirty_inode(mut, &le->metablob, in, follows);
if (realm->have_past_parents_open()) {
dout(10) << "do_cap_import " << session->inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
- in->inode,
- realm->inode->ino(),
- cap->get_last_seq(),
- cap->pending(),
- cap->wanted(),
- cap->get_mseq());
+ in->inode,
+ realm->inode->ino(),
+ cap->get_last_seq(),
+ cap->pending(), cap->wanted(), 0,
+ cap->get_mseq());
realm->build_snap_trace(reap->snapbl);
mds->send_message_client(reap, session->inst);
} else {
dout(7) << "finish_export_inode telling client" << it->first
<< " exported caps on " << *in << dendl;
MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT,
- in->inode,
- in->find_snaprealm()->inode->ino(),
- cap->get_last_seq(),
- cap->pending(),
- cap->wanted(),
- cap->get_mseq());
+ in->inode,
+ in->find_snaprealm()->inode->ino(),
+ cap->get_last_seq(),
+ cap->pending(), cap->wanted(), 0,
+ cap->get_mseq());
mds->send_message_client(m, it->first);
}
in->clear_client_caps();
int get_caps() { return head.caps; }
int get_wanted() { return head.wanted; }
+ int get_dirty() { return head.dirty; }
capseq_t get_seq() { return head.seq; }
capseq_t get_mseq() { return head.migrate_seq; }
long seq,
int caps,
int wanted,
+ int dirty,
int mseq) :
Message(CEPH_MSG_CLIENT_CAPS) {
memset(&head, 0, sizeof(head));
head.seq = seq;
head.caps = caps;
head.wanted = wanted;
+ head.dirty = dirty;
head.migrate_seq = mseq;
head.uid = inode.uid;
<< " ino " << inodeno_t(head.ino)
<< " seq " << head.seq
<< " caps=" << ccap_string(head.caps)
- << " wanted=" << ccap_string(head.wanted)
- << " size " << head.size << "/" << head.max_size;
+ << " dirty=" << ccap_string(head.dirty)
+ << " wanted=" << ccap_string(head.wanted);
+ out << " follows " << snapid_t(head.snap_follows);
+ if (head.migrate_seq)
+ out << " mseq " << head.migrate_seq;
+
+ out << " size " << head.size << "/" << head.max_size;
if (head.truncate_seq)
out << " ts " << head.truncate_seq;
out << " mtime " << utime_t(head.mtime);
if (head.time_warp_seq)
out << " tws " << head.time_warp_seq;
- out << " follows " << snapid_t(head.snap_follows);
- if (head.migrate_seq)
- out << " mseq " << head.migrate_seq;
+
out << ")";
}