From 3d4e4ac2586ad64ec7284b741dc13d8b479b9c52 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Jul 2008 13:30:24 -0700 Subject: [PATCH] mds: SnapRealm --- src/Makefile.am | 1 + src/TODO | 4 ++- src/include/ceph_fs.h | 1 + src/mds/CInode.h | 8 +++-- src/mds/Capability.h | 5 +-- src/mds/SnapTable.cc | 2 +- src/mds/journal.cc | 2 +- src/mds/snap.cc | 72 +++++++++++++++++++++++++++++++++++++++++++ src/mds/snap.h | 71 +++++++++++++++++++++++++++++++++++++++--- 9 files changed, 155 insertions(+), 11 deletions(-) create mode 100644 src/mds/snap.cc diff --git a/src/Makefile.am b/src/Makefile.am index d616484463be9..d05cf1b2fa9b3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -206,6 +206,7 @@ libmds_a_SOURCES = \ mds/MDSTable.cc \ mds/IdAllocator.cc \ mds/SnapTable.cc \ + mds/snap.cc \ mds/SessionMap.cc \ mds/MDLog.cc diff --git a/src/TODO b/src/TODO index ea9570e031425..0f370a97f3706 100644 --- a/src/TODO +++ b/src/TODO @@ -229,6 +229,8 @@ todo - client capgroups - mds snapid allocation - snap creation + - async SnapClient for the (possibly remote) SnapTable + - hmm, can we generalize any of AnchorClient? - mds metadata versioning - mds server ops @@ -281,7 +283,7 @@ in SnapRealm, - when we create a snapshot, - xlock snaplock - - create realm, if necesarry + - create realm, if necessary - add it to the realm snaps list. 
- build list of current children - send client a capgroup update for each affected realm diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 4c547631c4068..71ae52da3ac43 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -780,6 +780,7 @@ struct ceph_mds_file_caps { __le32 seq; __le32 caps, wanted; __le64 ino; + __le64 realm; __le64 size, max_size; __le32 migrate_seq; struct ceph_timespec mtime, atime, ctime; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 8a6008bce9b56..528a282ad5dcd 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -47,6 +47,7 @@ class CInode; class CInodeDiscover; class MDCache; class LogSegment; +class SnapRealm; ostream& operator<<(ostream& out, CInode& in); @@ -132,6 +133,7 @@ class CInode : public MDSCacheObject { string symlink; // symlink dest, if symlink map xattrs; fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. + SnapRealm *snaprealm; off_t last_journaled; // log offset for the last time i was journaled off_t last_open_journaled; // log offset for the last journaled EOpen @@ -211,7 +213,7 @@ public: // -- distributed state -- protected: // file capabilities - map client_caps; // client -> caps + map client_caps; // client -> caps map mds_caps_wanted; // [auth] mds -> caps wanted int replica_caps_wanted; // [replica] what i've requested from auth utime_t replica_caps_wanted_keep_until; @@ -220,6 +222,7 @@ protected: // LogSegment xlists i (may) belong to xlist::item xlist_dirty; public: + xlist::item xlist_caps; xlist::item xlist_open_file; xlist::item xlist_dirty_dirfrag_dir; xlist::item xlist_dirty_dirfrag_dirfragtree; @@ -254,13 +257,14 @@ private: // --------------------------- CInode(MDCache *c, bool auth=true) : mdcache(c), + snaprealm(0), last_journaled(0), last_open_journaled(0), //hack_accessed(true), stickydir_ref(0), parent(0), projected_parent(0), inode_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), - xlist_dirty(this), xlist_open_file(this), 
+ xlist_dirty(this), xlist_caps(this), xlist_open_file(this), xlist_dirty_dirfrag_dir(this), xlist_dirty_dirfrag_dirfragtree(this), xlist_purging_inode(this), diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 6fb52ca2bee22..911a9d7c25252 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -73,8 +73,9 @@ public: private: CInode *inode; + xlist::item cap_group_item; __u32 wanted_caps; // what the client wants (ideally) - + map cap_history; // seq -> cap, [last_recv,last_sent] capseq_t last_sent, last_recv; capseq_t last_open; @@ -86,7 +87,7 @@ public: xlist::item session_caps_item; Capability(CInode *i=0, int want=0, capseq_t s=0) : - inode(i), + inode(i), cap_group_item(this), wanted_caps(want), last_sent(s), last_recv(s), diff --git a/src/mds/SnapTable.cc b/src/mds/SnapTable.cc index 5481ca8efbe15..547f262e63c23 100644 --- a/src/mds/SnapTable.cc +++ b/src/mds/SnapTable.cc @@ -40,7 +40,7 @@ snapid_t SnapTable::create(inodeno_t base, const string& name, utime_t stamp) snapid_t sn = ++last_snap; snaps[sn].snapid = sn; - snaps[sn].base = base; + snaps[sn].dirino = base; snaps[sn].name = name; snaps[sn].stamp = stamp; version++; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 62c0ce037ac94..cfddab0820a86 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -639,7 +639,7 @@ void ESnap::replay(MDS *mds) assert(version-1 == mds->snaptable->get_version()); if (create) { - snapid_t s = mds->snaptable->create(snap.base, snap.name, snap.stamp); + snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp); assert(s == snap.snapid); } else { mds->snaptable->remove(snap.snapid); diff --git a/src/mds/snap.cc b/src/mds/snap.cc new file mode 100644 index 0000000000000..3479d20e62f21 --- /dev/null +++ b/src/mds/snap.cc @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004- Sage Weil + * + * 
This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "snap.h" +#include "MDCache.h" +#include "MDS.h" + + +/* + * SnapRealm + */ + /* dout(x): debug-log prefix used by the functions below. Streams to _dout only when x < g_conf.debug_mds, and prints g_clock.now(), the mds node id and the realm dirino. It names mdcache and dirino unqualified, so both must be in scope where it is used. */ +#define dout(x) if (x < g_conf.debug_mds) *_dout << dbeginl << g_clock.now() \ + << " mds" << mdcache->mds->get_nodeid() \ + << ".snaprealm(" << dirino << ") " + /* open_parents: walk parents and check that the inode for each p->second.dirino is in the cache (mdcache->get_inode). On the first miss, call mdcache->open_remote_ino with a new C_MDS_RetryRequest(mdcache, mdr) as its last argument and return false. Returns true when no parent is missing. Presumably the request is retried once the open finishes - confirm against C_MDS_RetryRequest. */ +bool SnapRealm::open_parents(MDRequest *mdr) +{ + dout(10) << "open_parents" << dendl; + for (multimap::iterator p = parents.begin(); + p != parents.end(); + p++) { + CInode *parent = mdcache->get_inode(p->second.dirino); + if (parent) + continue; + mdcache->open_remote_ino(p->second.dirino, mdr, + new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + return true; +} + +/* + * get list of snaps for this realm. we must include parents' snaps + * for the intervals during which they were our parent. + */ +void SnapRealm::get_snap_list(set &s) /* only inserts into s; s is not cleared first */ +{ + // start with my snaps + for (map::iterator p = snaps.begin(); + p != snaps.end(); + p++) + s.insert(p->first); + + // include parent snaps + for (multimap::iterator p = parents.begin(); + p != parents.end(); + p++) { + CInode *parent = mdcache->get_inode(p->second.dirino); + assert(parent); // call open_parents first! + assert(parent->snaprealm); + /* take a parent snap q only when p->second.first <= q->first <= p->first, i.e. inside the range recorded for this parent link */ + for (map::iterator q = parent->snaprealm->snaps.begin(); + q != parent->snaprealm->snaps.end(); + q++) + if (q->first <= p->first && + q->first >= p->second.first) + s.insert(q->first); + } + dout(10) << "build_snap_list " << s << dendl; /* NOTE(review): the log tag reads build_snap_list but this function is get_snap_list - confirm which name is intended */ +} diff --git a/src/mds/snap.h b/src/mds/snap.h index cf446d14bc3ea..999e37b2154a0 100644 --- a/src/mds/snap.h +++ b/src/mds/snap.h @@ -15,21 +15,27 @@ #ifndef __CEPH_MDS_SNAP_H #define __CEPH_MDS_SNAP_H +#include "mdstypes.h" +#include "include/xlist.h" + +/* + * generic snap descriptor. 
+ */ struct SnapInfo { snapid_t snapid; - inodeno_t base; + inodeno_t dirino; utime_t stamp; string name; void encode(bufferlist& bl) const { ::encode(snapid, bl); - ::encode(base, bl); + ::encode(dirino, bl); ::encode(stamp, bl); ::encode(name, bl); } void decode(bufferlist::iterator& bl) { ::decode(snapid, bl); - ::decode(base, bl); + ::decode(dirino, bl); ::decode(stamp, bl); ::decode(name, bl); } @@ -37,7 +43,64 @@ struct SnapInfo { WRITE_CLASS_ENCODER(SnapInfo) inline ostream& operator<<(ostream& out, const SnapInfo &sn) { - return out << "snap(" << sn.snapid << " " << sn.base << " '" << sn.name << "' " << sn.stamp << ")"; + return out << "snap(" << sn.snapid + << " " << sn.dirino + << " '" << sn.name + << "' " << sn.stamp << ")"; } + + +/* + * SnapRealm - a subtree that shares the same set of snapshots. + */ +struct SnapRealm; +struct CapabilityGroup; +class CInode; +class MDCache; +class MDRequest; + /* snaplink_t - appears to be the mapped value of SnapRealm::parents and children (the template arguments are missing from this copy of the patch, but snap.cc reads p->second.dirino and p->second.first on parents entries). dirino is the inode number passed to mdcache->get_inode to find the linked realm; first is the lower snapid bound tested in get_snap_list. NOTE(review): no constructor is declared, so confirm every creation site sets both fields. */ +struct snaplink_t { + inodeno_t dirino; + snapid_t first; +}; + +struct SnapRealm { + // realm state + inodeno_t dirino; + map snaps; + multimap parents, children; // key is "last" (or NOSNAP) + + // in-memory state + MDCache *mdcache; + CInode *inode; /* NOTE(review): presumably the directory inode whose number is dirino - confirm at the construction site */ + + // caches? + //set cached_snaps; + //set cached_active_children; // active children that are currently open + + xlist inodes_with_caps; // for efficient realm splits + map client_cap_groups; // to identify clients who need snap notifications + + SnapRealm(inodeno_t i, MDCache *c, CInode *in) : dirino(i), mdcache(c), inode(in) {} /* stores the three arguments; every container member is default-constructed */ + + bool open_parents(MDRequest *mdr); /* defined in snap.cc; returns false after starting an open for a parent inode that is not cached */ + void get_snap_list(set& s); /* defined in snap.cc; asserts that every parent inode is cached, so open_parents must have returned true first */ +}; + + + +/* + * CapabilityGroup - group per-realm, per-client caps for efficient + * client snap notifications. + */ +struct Capability; + /* NOTE(review): no constructor is declared for CapabilityGroup - confirm every creation site sets client and realm */ +struct CapabilityGroup { + int client; + xlist caps; + SnapRealm *realm; +}; + + #endif -- 2.39.5