snapshot notes --
todo
-/- basic types (snapid_t, etc.)
-/- snap lineage in MOSDOp
- rados bits to do clone+write
-/ - cloning
- fix cloning on unlinked file (where snaps=[], but head may have follows_snap attr)
- - make sense of snap_highwater...
-
- figure out how to fix up rados logging
- snap collections
- garbage collection
-- mds types
-- client capgroups
-- mds snapid allocation
+- realms
+ - make better sense of snap_highwater...?
- snap creation
- - async SnapClient for the (possibly remote) SnapTable
+ - enforce name uniqueness?
+ - async SnapClient for the possibly remote SnapTable
- hmm, can we generalize any of AnchorClient?
-- mds metadata versioning
-- mds server ops
-
-- base types
-
-typedef __u64 snapid_t;
-#define MAXSNAP (snapid_t)(0xffffffffffffffull) /* 56 bits.. see ceph_pg */
-#define NOSNAP (snapid_t)(-1)
-
-- let's go with [first, last] throughout, instead of non-inclusive drev...
-
-
-
-mds
-- break mds hierarchy into snaprealms
- - keep per-realm inode xlists, so that breaking a realm is O(size(realm))
-struct Snap {
- snapid_t snapid;
- string name;
- utime_t ctime;
-};
-
-struct snaplink_t {
- snaprealm *realm;
- snapid_t first;
-};
-struct SnapRealm {
- inodeno_t dirino;
- map<snapid_t, Snap> snaps;
-
- int nlink;
- multimap<snapid_t, snaplink_t> parents; // key is "last" (or NOSNAP)
- multimap<snapid_t, snaplink_t> children;
-
- xlist<CInode*> inodes_with_caps; // used for efficient realm splits
-};
-- realm's parent can vary over time; we need to track the full history, so that we know which parents' snaps to include in the snap lineage.
+- mds metadata versioning
+ - (dir) inode versions..
-- link client caps to realm, so that snapshot creation is O(num_child_realms*num_clients)
- - keep per-realm, per-client record with cap refcount, to avoid traversing realm inode lists looking for caps
+- will snapshots and CAS play nice?
-struct CapabilityGroup {
- int client;
- xlist<Capability*> caps;
- SnapRealm *realm;
-};
-in SnapRealm,
- map<int, CapabilityGroup*> client_cap_groups; // used to identify clients who need snap notifications
+- mds server ops
- when we create a snapshot,
- xlock snaplock
void CInode::open_snaprealm()
{
if (!snaprealm) {
+ SnapRealm *parent = find_snaprealm();
snaprealm = new SnapRealm(mdcache, this);
-
- snaprealm->open_parent = find_containing_snaprealm();
- if (snaprealm->open_parent) {
- snaprealm->open_parent->open_children.insert(snaprealm);
+ if (parent) {
+ snaprealm->parent = parent;
+ parent->open_children.insert(snaprealm);
dout(10) << " opened snaprealm " << snaprealm
- << " parent is " << snaprealm->open_parent
- << " siblings are " << snaprealm->open_parent->open_children
+ << " parent is " << parent
+ << " siblings are " << parent->open_children
<< dendl;
}
}
void CInode::close_snaprealm()
{
if (snaprealm) {
- if (snaprealm->open_parent)
- snaprealm->open_parent->open_children.erase(snaprealm);
+ if (snaprealm->parent)
+ snaprealm->parent->open_children.erase(snaprealm);
delete snaprealm;
snaprealm = 0;
}
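* note: find_snaprealm() starts its walk at this inode, so it returns
* this->snaprealm itself when one is open. the older
* find_containing_snaprealm() was _not_ inclusive of this->snaprealm, as
* that is for nested directory content.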
*/
-SnapRealm *CInode::find_containing_snaprealm()
+SnapRealm *CInode::find_snaprealm()
{
CInode *cur = this;
- while (cur->get_parent_dn()) {
+ while (cur->get_parent_dn() && !cur->snaprealm)
cur = cur->get_parent_dn()->get_dir()->get_inode();
- if (cur->snaprealm)
- return cur->snaprealm;
- }
- return 0;
+ return cur->snaprealm;
}
void CInode::encode_snap(bufferlist &bl)
// -- snap --
void open_snaprealm();
void close_snaprealm();
- SnapRealm *find_containing_snaprealm();
+ SnapRealm *find_snaprealm();
void encode_snap(bufferlist &bl);
void decode_snap(bufferlist::iterator& p) {
bufferlist snapbl;
Capability *add_client_cap(int client, CInode *in) {
if (client_caps.empty()) {
get(PIN_CAPS);
- containing_realm = find_containing_snaprealm();
+ containing_realm = find_snaprealm();
containing_realm->inodes_with_caps.push_back(&xlist_caps);
}
<< dendl;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
in->inode,
- in->find_containing_snaprealm()->inode->ino(),
+ in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
Capability *cap = it->second;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_TRUNC,
in->inode,
- in->find_containing_snaprealm()->inode->ino(),
+ in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
dout(10) << "share_inode_max_size with client" << client << dendl;
mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
in->inode,
- in->find_containing_snaprealm()->inode->ino(),
+ in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
session->touch_cap(cap);
// send IMPORT
- SnapRealm *realm = in->find_containing_snaprealm();
+ SnapRealm *realm = in->find_snaprealm();
MClientFileCaps *reap = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
in->inode,
realm->inode->ino(),
CInode *in = *file_recover_queue.begin();
file_recover_queue.erase(in);
- vector<snapid_t> *snaps = in->find_containing_snaprealm()->get_snap_vector();
+ vector<snapid_t> *snaps = in->find_snaprealm()->get_snap_vector();
if (in->inode.max_size > in->inode.size) {
dout(10) << "do_file_recover starting " << in->inode.size << "/" << in->inode.max_size
<< " exported caps on " << *in << dendl;
MClientFileCaps *m = new MClientFileCaps(CEPH_CAP_OP_EXPORT,
in->inode,
- in->find_containing_snaprealm()->inode->ino(),
+ in->find_snaprealm()->inode->ino(),
cap->get_last_seq(),
cap->pending(),
cap->wanted(),
}
cap->merge(it->second);
- SnapRealm *realm = in->find_containing_snaprealm();
+ SnapRealm *realm = in->find_snaprealm();
MClientFileCaps *caps = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
in->inode,
realm->inode->ino(),
reply->set_file_caps_seq(cap->get_last_seq());
reply->set_file_caps_mseq(cap->get_mseq());
- SnapRealm *realm = cur->find_containing_snaprealm();
+ SnapRealm *realm = cur->find_snaprealm();
reply->get_snaps() = *realm->get_snap_vector();
reply->set_snap_info(realm->inode->ino(), realm->created, realm->snap_highwater);
dout(10) << " snaprealm is " << *realm << " snaps=" << reply->get_snaps() << " on " << *realm->inode << dendl;
snapid_t snapid = mds->snaptable->create(diri->ino(), req->get_path2(), mdr->now);
dout(10) << " snapid is " << snapid << dendl;
+
+ // GO.
+
// create realm?
inodeno_t split_parent = 0;
if (!diri->snaprealm) {
diri->open_snaprealm();
diri->snaprealm->created = snapid;
- // link them up
- // HACK! parent may be on another mds...
-
- SnapRealm *parent = diri->snaprealm->open_parent;
+ // split existing caps
+ SnapRealm *parent = diri->snaprealm->parent;
assert(parent);
assert(parent->open_children.count(diri->snaprealm));
- snaplink_t link;
- link.first = 0;
- link.dirino = parent->inode->ino();
- diri->snaprealm->parents.insert(pair<snapid_t,snaplink_t>(CEPH_NOSNAP, link));
-
- // split existing caps
parent->split_at(diri->snaprealm);
split_parent = parent->inode->ino();
}
bool SnapRealm::open_parents(MDRequest *mdr)
{
dout(10) << "open_parents" << dendl;
- for (multimap<snapid_t, snaplink_t>::iterator p = parents.begin();
- p != parents.end();
- p++) {
+
+ // make sure my current parent's parents are open...
+ if (parent) {
+ dout(10) << " parent is " << *parent
+ << " on " << *parent->inode << dendl;
+ if (!parent->open_parents(mdr))
+ return false;
+ }
+
+ // and my past parents too!
+ for (map<snapid_t, snaplink_t>::iterator p = past_parents.begin();
+ p != past_parents.end();
+ p++) {
CInode *parent = mdcache->get_inode(p->second.dirino);
if (parent)
continue;
s.insert(p->first);
// include snaps for parents during intervals that intersect [first,last]
- for (multimap<snapid_t, snaplink_t>::iterator p = parents.lower_bound(first);
- p != parents.end() && p->first >= first && p->second.first <= last;
+ snapid_t thru = first;
+ for (map<snapid_t, snaplink_t>::iterator p = past_parents.lower_bound(first);
+ p != past_parents.end() && p->first >= first && p->second.first <= last;
p++) {
- CInode *parent = mdcache->get_inode(p->second.dirino);
- assert(parent); // call open_parents first!
- assert(parent->snaprealm);
-
- parent->snaprealm->get_snap_set(s,
- MAX(first, p->second.first),
- MIN(last, p->first));
+ CInode *oldparent = mdcache->get_inode(p->second.dirino);
+ assert(oldparent); // call open_parents first!
+ assert(oldparent->snaprealm);
+
+ thru = MIN(last, p->first);
+ oldparent->snaprealm->get_snap_set(s,
+ MAX(first, p->second.first),
+ thru);
+ thru++;
}
+ if (thru <= last && parent)
+ parent->get_snap_set(s, thru, last);
}
/*
return get_snap_vector();
}
snap_highwater = creating;
- cached_snaps.push_back(creating);
+ cached_snaps.insert(cached_snaps.begin(), creating); // FIXME.. we should store this in reverse!
return &cached_snaps;
}
dout(10) << "split_at " << *child
<< " on " << *child->inode << dendl;
- // split children
+ // split open_children
dout(10) << " my children are " << open_children << dendl;
for (set<SnapRealm*>::iterator p = open_children.begin();
p != open_children.end(); ) {
if (realm != child &&
child->inode->is_ancestor_of(realm->inode)) {
dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl;
- realm->open_parent = child;
+ realm->parent = child;
child->open_children.insert(realm);
open_children.erase(p++);
} else {
// realm state
snapid_t created;
map<snapid_t, SnapInfo> snaps;
- multimap<snapid_t, snaplink_t> parents; // key is "last" (or NOSNAP)
+ map<snapid_t, snaplink_t> past_parents; // key is "last" (or NOSNAP)
void encode(bufferlist& bl) const {
::encode(created, bl);
::encode(snaps, bl);
- ::encode(parents, bl);
+ ::encode(past_parents, bl);
}
void decode(bufferlist::iterator& p) {
::decode(created, p);
::decode(snaps, p);
- ::decode(parents, p);
+ ::decode(past_parents, p);
}
// in-memory state
MDCache *mdcache;
CInode *inode;
- // caches?
- SnapRealm *open_parent;
+ SnapRealm *parent;
set<SnapRealm*> open_children; // active children that are currently open
+ // caches?
vector<snapid_t> cached_snaps;
snapid_t snap_highwater;
SnapRealm(MDCache *c, CInode *in) :
created(0),
mdcache(c), inode(in),
- open_parent(0),
+ parent(0),
snap_highwater(0)
{ }
inline ostream& operator<<(ostream& out, const SnapRealm &realm) {
out << "snaprealm(" << realm.snaps;
- if (realm.parents.size()) {
- out << " parents=(";
- for (multimap<snapid_t, snaplink_t>::const_iterator p = realm.parents.begin();
- p != realm.parents.end();
+ if (realm.past_parents.size()) {
+ out << " past_parents=(";
+ for (map<snapid_t, snaplink_t>::const_iterator p = realm.past_parents.begin();
+ p != realm.past_parents.end();
p++) {
- if (p != realm.parents.begin()) out << ",";
- out << p->second.first << "-";
- if (p->first == CEPH_NOSNAP)
- out << "head";
- else
- out << p->first;
- out << "=" << p->second.dirino;
+ if (p != realm.past_parents.begin()) out << ",";
+ out << p->second.first << "-" << p->first
+ << "=" << p->second.dirino;
}
out << ")";
}