Inode *in = p->second;
if (in->caps.count(mds)) {
dout(10) << " caps on " << p->first
- << " " << cap_string(in->caps[mds].issued)
+ << " " << cap_string(in->caps[mds]->issued)
<< " wants " << cap_string(in->caps_wanted())
<< dendl;
- in->caps[mds].seq = 0; // reset seq.
+ in->caps[mds]->seq = 0; // reset seq.
m->add_inode_caps(p->first, // ino
in->caps_wanted(), // wanted
- in->caps[mds].issued, // issued
+ in->caps[mds]->issued, // issued
in->inode.size, in->inode.mtime, in->inode.atime);
filepath path;
in->make_path(path);
if (in->caps.empty())
return; // guard if at end of func
- map<int,InodeCap>::iterator next;
- for (map<int,InodeCap>::iterator it = in->caps.begin();
+ map<int,InodeCap*>::iterator next;
+ for (map<int,InodeCap*>::iterator it = in->caps.begin();
it != in->caps.end();
it = next) {
next = it;
next++;
- InodeCap &cap = it->second;
- int revoking = cap.implemented & ~cap.issued;
+ InodeCap *cap = it->second;
+ int revoking = cap->implemented & ~cap->issued;
if (in->wanted_max_size > in->inode.max_size &&
in->wanted_max_size > in->requested_max_size)
/* completed revocation? */
if (revoking && (revoking && used) == 0) {
- dout(10) << "completed revocation of " << (cap.implemented & ~cap.issued) << dendl;
+ dout(10) << "completed revocation of " << (cap->implemented & ~cap->issued) << dendl;
goto ack;
}
/* approaching file_max? */
- if ((cap.issued & CEPH_CAP_WR) &&
+ if ((cap->issued & CEPH_CAP_WR) &&
(in->inode.size << 1) >= in->inode.max_size &&
(in->reported_size << 1) < in->inode.max_size) {
dout(10) << "size approaching max_size" << dendl;
goto ack;
}
- if ((cap.issued & ~wanted) == 0)
+ if ((cap->issued & ~wanted) == 0)
continue; /* nothing extra, all good */
/*
ack:
MClientFileCaps *m = new MClientFileCaps(CEPH_CAP_OP_ACK,
in->inode,
- it->second.seq,
- it->second.issued,
+ 0,
+ cap->seq,
+ cap->issued,
wanted,
0);
in->reported_size = in->inode.size;
* handle caps update from mds. including mds to mds caps transitions.
* do not block.
*/
-void Client::add_update_inode_cap(Inode *in, int mds, unsigned issued, unsigned seq, unsigned mseq)
+void Client::add_update_inode_cap(Inode *in, int mds,
+ inodeno_t realm, vector<snapid_t> &snaps,
+ unsigned issued, unsigned seq, unsigned mseq)
{
- if (!in->caps.count(mds)) {
+ InodeCap *cap = 0;
+ if (in->caps.count(mds)) {
+ cap = in->caps[mds];
+ } else {
mds_sessions[mds].num_caps++;
if (in->caps.empty())
in->get();
in->exporting_issued = 0;
in->exporting_mseq = 0;
}
+ in->caps[mds] = cap = new InodeCap(get_cap_realm(realm, snaps));
}
- unsigned old_caps = in->caps[mds].issued;
- InodeCap &cap = in->caps[mds];
- cap.issued |= issued;
- cap.implemented |= issued;
- cap.seq = seq;
- cap.mseq = mseq;
+ unsigned old_caps = cap->issued;
+ cap->issued |= issued;
+ cap->implemented |= issued;
+ cap->seq = seq;
+ cap->mseq = mseq;
if (issued & ~old_caps)
signal_cond_list(in->waitfor_caps);
void Client::remove_cap(Inode *in, int mds)
{
assert(in->caps.count(mds));
+ InodeCap *cap = in->caps[mds];
+ cap->realm_cap_item.remove_myself();
+ if (cap->realm->caps.empty())
+ remove_cap_realm(cap->realm);
in->caps.erase(mds);
if (in->caps.empty())
put_inode(in);
if (m->get_op() == CEPH_CAP_OP_IMPORT) {
// add/update it
- add_update_inode_cap(in, mds, m->get_caps(), m->get_seq(), m->get_mseq());
+ add_update_inode_cap(in, mds, m->get_realm(), m->get_snaps(), m->get_caps(), m->get_seq(), m->get_mseq());
if (in->exporting_mseq < m->get_mseq()) {
dout(5) << "handle_file_caps ino " << m->get_ino() << " mseq " << m->get_mseq()
// note?
bool found_higher_mseq = false;
InodeCap *cap = 0;
- for (map<int,InodeCap>::iterator p = in->caps.begin();
+ for (map<int,InodeCap*>::iterator p = in->caps.begin();
p != in->caps.end();
p++) {
if (p->first == mds) {
- cap = &p->second;
+ cap = p->second;
continue;
}
- if (p->second.mseq > m->get_mseq()) {
+ if (p->second->mseq > m->get_mseq()) {
found_higher_mseq = true;
dout(5) << "handle_file_caps ino " << m->get_ino() << " mseq " << m->get_mseq()
<< " EXPORT from mds" << mds
- << ", but mds" << p->first << " has higher mseq " << p->second.mseq << dendl;
+ << ", but mds" << p->first << " has higher mseq " << p->second->mseq << dendl;
break;
}
}
}
// ok!
- InodeCap &cap = in->caps[mds];
+ InodeCap *cap = in->caps[mds];
// truncate?
return;
}
- cap.seq = m->get_seq();
+ cap->seq = m->get_seq();
// don't want it?
int wanted = in->caps_wanted();
int used = in->caps_used();
// update per-mds caps
- const int old_caps = cap.issued;
+ const int old_caps = cap->issued;
const int new_caps = m->get_caps();
dout(5) << "handle_file_caps on in " << m->get_ino()
<< " mds" << mds << " seq " << m->get_seq()
if (old_caps & ~new_caps) {
dout(10) << " revocation of " << cap_string(~new_caps & old_caps) << dendl;
- cap.issued = new_caps;
+ cap->issued = new_caps;
- if ((cap.issued & ~new_caps) & CEPH_CAP_RDCACHE)
+ if ((cap->issued & ~new_caps) & CEPH_CAP_RDCACHE)
_release(in, false);
if ((used & ~new_caps) & CEPH_CAP_WRBUFFER)
_flush(in, false);
else {
ack = true;
- cap.implemented = new_caps;
+ cap->implemented = new_caps;
// share our (possibly newer) file size, mtime, atime
m->set_size(in->inode.size);
dout(10) << " caps unchanged at " << cap_string(old_caps) << dendl;
} else {
dout(10) << " grant, new caps are " << cap_string(new_caps & ~old_caps) << dendl;
- cap.issued = cap.implemented = new_caps;
+ cap->issued = cap->implemented = new_caps;
}
// wake up waiters
// add the cap
int mds = reply->get_source().num();
add_update_inode_cap(in, mds,
+ reply->get_file_caps_realm(),
+ reply->get_snaps(),
reply->get_file_caps(),
reply->get_file_caps_seq(),
reply->get_file_caps_mseq());
};
-class InodeCap {
- public:
+struct InodeCap;
+
+struct CapRealm {   // one realm on the client: its snaps plus the InodeCaps attached to it
+ inodeno_t dirino;   // key under which this realm is stored in / erased from cap_realms
+ vector<snapid_t> snaps;   // snap ids taken over from the constructor argument
+ xlist<InodeCap*> caps;   // InodeCaps in this realm; each InodeCap links itself in when constructed
+
+ CapRealm(inodeno_t i, vector<snapid_t> &s) : dirino(i) {
+ snaps.swap(s);   // swap, not copy: the caller's vector `s` is left empty afterwards
+ }
+};
+
+struct InodeCap {
unsigned issued;
unsigned implemented;
__u64 seq;
__u32 mseq; // migration seq
- InodeCap() : issued(0), implemented(0), seq(0), mseq(0) {}
+ CapRealm *realm;   // realm handed to the constructor
+ xlist<InodeCap*>::item realm_cap_item;   // this cap's link in realm->caps
+
+ InodeCap(CapRealm *r) : issued(0), implemented(0), seq(0), mseq(0),
+ realm(r), realm_cap_item(this) {
+ realm->caps.push_back(&realm_cap_item);   // joins the realm's list right away; r is dereferenced, so it must be non-null
+ }
};
+
class Inode {
public:
inode_t inode; // the actual inode
bool dir_hashed, dir_replicated;
// per-mds caps
- map<int,InodeCap> caps; // mds -> InodeCap
+ map<int,InodeCap*> caps; // mds -> InodeCap
unsigned exporting_issued;
int exporting_mds;
capseq_t exporting_mseq;
int caps_issued() {
int c = exporting_issued;
- for (map<int,InodeCap>::iterator it = caps.begin();
+ for (map<int,InodeCap*>::iterator it = caps.begin();
it != caps.end();
it++)
- c |= it->second.issued;
+ c |= it->second->issued;
return c;
}
Inode* root;
LRU lru; // lru list of Dentry's in our local metadata cache.
+ map<inodeno_t,CapRealm*> cap_realms;
+
+ /*
+  * Return the realm registered under inode number r, creating and
+  * registering it on first use.  `snaps` is only consumed when a new
+  * realm is built (the CapRealm constructor swaps its contents away);
+  * if the realm already exists, `snaps` is left untouched and the
+  * realm keeps the snaps it already had.
+  */
+ CapRealm *get_cap_realm(inodeno_t r, vector<snapid_t> &snaps) {
+   map<inodeno_t,CapRealm*>::iterator known = cap_realms.find(r);
+   if (known != cap_realms.end())
+     return known->second;
+   CapRealm *fresh = new CapRealm(r, snaps);
+   cap_realms[r] = fresh;
+   return fresh;
+ }
+ /*
+  * Unregister a realm that has no caps left and free it.
+  *
+  * The realm is allocated with new in get_cap_realm() and cap_realms is
+  * the registry it is looked up through, so erasing the map entry
+  * without deleting the object would leak it.  The caller must not use
+  * `realm` after this returns.
+  */
+ void remove_cap_realm(CapRealm *realm) {
+   assert(realm->caps.empty());
+   cap_realms.erase(realm->dirino);
+   delete realm;
+ }
+
// file handles, etc.
filepath cwd;
interval_set<int> free_fd_set; // unused fds
void release_lease(Inode *in, Dentry *dn, int mask);
// file caps
- void add_update_inode_cap(Inode *in, int mds, unsigned issued, unsigned seq, unsigned mseq);
+ void add_update_inode_cap(Inode *in, int mds,
+ inodeno_t realm, vector<snapid_t> &snaps,
+ unsigned issued, unsigned seq, unsigned mseq);
void remove_cap(Inode *in, int mds);
void handle_file_caps(class MClientFileCaps *m);
void check_caps(Inode *in);
__le32 op;
__le32 result;
__le32 file_caps;
+ __le64 file_caps_realm;
__le32 file_caps_seq;
__le32 file_caps_mseq;
__le32 mdsmap_epoch;
+ __le32 num_snaps;
+ __le64 snaps[];
} __attribute__ ((packed));
/*
struct ceph_frag_tree_head {
__le32 nsplits;
- struct ceph_frag_tree_split splits[0];
+ struct ceph_frag_tree_split splits[];
} __attribute__ ((packed));
struct ceph_mds_reply_inode {
__le32 migrate_seq;
struct ceph_timespec mtime, atime, ctime;
__le64 time_warp_seq;
+ __le32 num_snaps;
+ __le64 snaps[];
} __attribute__ ((packed));
__le32 shed_count;
struct ceph_osd_peer_stat peer_stat;
- __le32 num_snap;
- __le64 snap[];
+ __le32 num_snaps;
+ __le64 snaps[];
} __attribute__ ((packed));
struct ceph_osd_reply_head {
}
}
+/*
+ * Find the snaprealm that contains this inode: walk up through the
+ * parent dentries and return the realm of the nearest ancestor that
+ * has one.
+ *
+ * note: this is _not_ inclusive of *this->snaprealm, as that is for
+ * nested directory content.
+ *
+ * Returns 0 if an inode without a parent dentry is reached before any
+ * ancestor with a snaprealm is found.
+ */
+SnapRealm *CInode::find_containing_snaprealm()
+{
+  CInode *cur = this;
+  while (1) {
+    if (!cur->get_parent_dn())
+      return 0;
+    cur = cur->get_parent_dn()->get_dir()->get_inode();
+    if (cur->snaprealm)
+      return cur->snaprealm;  // the ancestor's realm, not this->snaprealm
+  }
+}
+
void CInode::encode_snap(bufferlist &bl)
{
bufferlist snapbl;
#include "ScatterLock.h"
#include "LocalLock.h"
#include "Capability.h"
-
+#include "snap.h"
#include <cassert>
#include <list>
// -- snap --
void open_snaprealm();
void close_snaprealm();
+ SnapRealm *find_containing_snaprealm();
void encode_snap(bufferlist &bl);
void decode_snap(bufferlist::iterator& p) {
bufferlist snapbl;
assert(client_caps.count(client) == 0);
Capability *cap = client_caps[client] = new Capability;
cap->set_inode(in);
+
+ SnapRealm *realm = find_containing_snaprealm();
+ realm->add_cap(client, cap);
+
return cap;
}
void remove_client_cap(int client) {
private:
CInode *inode;
- xlist<Capability*>::item cap_group_item;
__u32 wanted_caps; // what the client wants (ideally)
map<capseq_t, __u32> cap_history; // seq -> cap, [last_recv,last_sent]
bool stale;
public:
xlist<Capability*>::item session_caps_item;
+ xlist<Capability*>::item snaprealm_caps_item;
Capability(CInode *i=0, int want=0, capseq_t s=0) :
- inode(i), cap_group_item(this),
+ inode(i),
wanted_caps(want),
last_sent(s),
last_recv(s),
last_open(0),
mseq(0),
suppress(false), stale(false),
- session_caps_item(this) {
- }
+ session_caps_item(this), snaprealm_caps_item(this) { }
capseq_t get_mseq() { return mseq; }
<< dendl;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
Capability *cap = it->second;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_TRUNC,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
dout(10) << "share_inode_max_size with client" << client << dendl;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
// send IMPORT
MClientFileCaps *reap = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
cap->get_mseq());
+ in->find_containing_snaprealm()->get_snap_vector(reap->get_snaps());
mds->messenger->send_message(reap, session->inst);
}
<< " exported caps on " << *in << dendl;
MClientFileCaps *m = new MClientFileCaps(CEPH_CAP_OP_EXPORT,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
MClientFileCaps *caps = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
in->inode,
+ in->find_containing_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
cap->get_mseq());
+ in->find_containing_snaprealm()->get_snap_vector(caps->get_snaps());
mds->send_message_client(caps, session->inst);
}
MClientFileCaps *stale = new MClientFileCaps(CEPH_CAP_OP_EXPORT,
fake_inode,
0,
+ 0,
0, // doesn't matter.
p->second.wanted, // doesn't matter.
0); // FIXME get proper mseq here? hmm.
reply->set_file_caps(cap->pending());
reply->set_file_caps_seq(cap->get_last_seq());
reply->set_file_caps_mseq(cap->get_mseq());
+ cur->find_containing_snaprealm()->get_snap_vector(reply->get_snaps());
//reply->set_file_data_version(fdv);
reply_request(mdr, reply);
* get list of snaps for this realm. we must include parents' snaps
* for the intervals during which they were our parent.
*/
-void SnapRealm::get_snap_list(set<snapid_t> &s)
+void SnapRealm::get_snap_set(set<snapid_t> &s)
{
// start with my snaps
for (map<snapid_t, SnapInfo>::iterator p = snaps.begin();
}
dout(10) << "build_snap_list " << s << dendl;
}
+
+/*
+ * Fill v with the snaps collected by get_snap_set(), in reverse sorted
+ * order (the last element of the set comes first).  Whatever v held
+ * before is replaced.
+ */
+void SnapRealm::get_snap_vector(vector<snapid_t> &v)
+{
+  set<snapid_t> sorted;
+  get_snap_set(sorted);
+  // a set iterates in sorted order, so walking it backwards reverses it
+  v.assign(sorted.rbegin(), sorted.rend());
+}
class MDCache;
class MDRequest;
+
+/*
+ * Capability.h is needed because SnapRealm groups caps per realm and
+ * per client (xlist<Capability*>) for efficient client snap
+ * notifications.
+ */
+#include "Capability.h"
+
struct snaplink_t {
inodeno_t dirino;
snapid_t first;
//set<snapid_t> cached_snaps;
//set<SnapRealm*> cached_active_children; // active children that are currently open
- xlist<CInode*> inodes_with_caps; // for efficient realm splits
- map<int, CapabilityGroup*> client_cap_groups; // to identify clients who need snap notifications
+ xlist<CInode*> inodes_with_caps; // for efficient realm splits
+ map<int, xlist<Capability*> > client_caps; // to identify clients who need snap notifications
SnapRealm(MDCache *c, CInode *in) : mdcache(c), inode(in) {}
bool open_parents(MDRequest *mdr);
- void get_snap_list(set<snapid_t>& s);
+ void get_snap_set(set<snapid_t>& s);
+ void get_snap_vector(vector<snapid_t>& s);
+ // Link cap onto the list of caps that `client` holds within this realm.
+ void add_cap(int client, Capability *cap) {
+   xlist<Capability*> &held = client_caps[client];  // map entry is created on first use
+   held.push_back(&cap->snaprealm_caps_item);
+ }
};
WRITE_CLASS_ENCODER(SnapRealm)
-/*
- * CapabilityGroup - group per-realm, per-client caps for efficient
- * client snap notifications.
- */
-struct Capability;
-
-struct CapabilityGroup {
- int client;
- xlist<Capability*> caps;
- SnapRealm *realm;
-};
#endif
private:
struct ceph_mds_file_caps h;
+ vector<snapid_t> snaps;
public:
int get_caps() { return h.caps; }
int get_wanted() { return h.wanted; }
capseq_t get_seq() { return h.seq; }
capseq_t get_mseq() { return h.migrate_seq; }
+ vector<snapid_t> &get_snaps() { return snaps; }
inodeno_t get_ino() { return inodeno_t(h.ino); }
+ inodeno_t get_realm() { return inodeno_t(h.realm); }
__u64 get_size() { return h.size; }
__u64 get_max_size() { return h.max_size; }
utime_t get_ctime() { return utime_t(h.ctime); }
MClientFileCaps() {}
MClientFileCaps(int op,
inode_t& inode,
+ inodeno_t realm,
long seq,
int caps,
int wanted,
h.caps = caps;
h.wanted = wanted;
h.ino = inode.ino;
+ h.realm = realm;
h.size = inode.size;
h.max_size = inode.max_size;
h.migrate_seq = mseq;
void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(h, p);
+ snaps.resize(h.num_snaps);   // count comes from the header just decoded
+ for (unsigned i=0; i<snaps.size(); i++)   // snap ids follow the header, one per entry
+ ::decode(snaps[i], p);
}
void encode_payload() {
+ h.num_snaps = snaps.size();   // must be set before the header is encoded so the decoder knows how many ids follow
::encode(h, payload);
+ for (unsigned i=0; i<snaps.size(); i++)   // snap ids are appended right after the header
+ ::encode(snaps[i], payload);
}
};
class MClientReply : public Message {
// reply data
struct ceph_mds_reply_head st;
+ vector<snapid_t> snaps;
bufferlist trace_bl;
bufferlist dir_bl;
int get_result() { return (__s32)(__u32)st.result; }
+ vector<snapid_t> &get_snaps() { return snaps; }
+
unsigned get_file_caps() { return st.file_caps; }
+ inodeno_t get_file_caps_realm() { return inodeno_t((__u64)st.file_caps_realm); }
unsigned get_file_caps_seq() { return st.file_caps_seq; }
unsigned get_file_caps_mseq() { return st.file_caps_mseq; }
//uint64_t get_file_data_version() { return st.file_data_version; }
virtual void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(st, p);
+ snaps.resize(st.num_snaps);   // count comes from the reply head just decoded
+ for (unsigned i=0; i<snaps.size(); i++)   // snap ids sit between the head and trace_bl
+ ::decode(snaps[i], p);
::decode(trace_bl, p);
::decode(dir_bl, p);
assert(p.end());
}
virtual void encode_payload() {
+ st.num_snaps = snaps.size();   // must be set before the head is encoded so the decoder knows how many ids follow
::encode(st, payload);
+ for (unsigned i=0; i<snaps.size(); i++)   // snap ids go between the head and trace_bl, matching decode_payload
+ ::encode(snaps[i], payload);
::encode(trace_bl, payload);
::encode(dir_bl, payload);
}
private:
ceph_osd_request_head head;
- vector<snapid_t> snap;
+ vector<snapid_t> snaps;
friend class MOSDOpReply;
public:
- vector<snapid_t> &get_snap() { return snap; }
+ vector<snapid_t> &get_snaps() { return snaps; }
osd_reqid_t get_reqid() { return osd_reqid_t(head.client_inst.name,
head.client_inc,
// marshalling
virtual void encode_payload() {
- head.num_snap = snap.size();
+ head.num_snaps = snaps.size();
::encode(head, payload);
- for (unsigned i=0; i<snap.size(); i++)
- ::encode(snap[i], payload);
+ for (unsigned i=0; i<snaps.size(); i++)
+ ::encode(snaps[i], payload);
env.data_off = get_offset();
}
virtual void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(head, p);
- snap.resize(head.num_snap);
- for (unsigned i=0; i<snap.size(); i++)
- ::decode(snap[i], p);
+ snaps.resize(head.num_snaps);
+ for (unsigned i=0; i<snaps.size(); i++)
+ ::decode(snaps[i], p);
}
if (get_length()) out << " " << get_offset() << "~" << get_length();
out << " " << pg_t(head.layout.ol_pgid);
if (is_retry_attempt()) out << " RETRY";
- if (!snap.empty())
- out << " snap=" << snap;
+ if (!snaps.empty())
+ out << " snaps=" << snaps;
out << ")";
}
};
// snap read. hrm.
// are we missing a revision that we might need?
// let's get them all.
- for (unsigned i=0; i<op->get_snap().size(); i++) {
+ for (unsigned i=0; i<op->get_snaps().size(); i++) {
object_t oid = op->get_oid();
- oid.snap = op->get_snap()[i];
+ oid.snap = op->get_snaps()[i];
if (pg->is_missing_object(oid)) {
dout(10) << "handle_op _may_ need missing rev " << oid << ", pulling" << dendl;
pg->wait_for_missing_object(op->get_oid(), op);
long r = 0;
// do it.
- if (poid.oid.snap && !pick_object_rev(poid, op->get_snap())) {
+ if (poid.oid.snap && !pick_object_rev(poid, op->get_snaps())) {
// we have no revision for this request.
r = -EEXIST;
goto done;
dout(10) << "op_modify " << opname
<< " " << poid.oid
<< " av " << av
- << " snaps=" << op->get_snap()
+ << " snaps=" << op->get_snaps()
<< " follows_snap " << follows
<< " " << op->get_offset() << "~" << op->get_length()
<< dendl;
prepare_transaction(repop->t, op->get_reqid(),
poid, op->get_op(), av,
op->get_offset(), op->get_length(), op->get_data(),
- follows, op->get_snap(),
+ follows, op->get_snaps(),
op->get_inc_lock(), peers_complete_thru);
}