From 779f9703a9eb588200e8c1a72d46a55f2d9d2df7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 31 Jul 2008 16:49:55 -0700 Subject: [PATCH] mds: the reconnect trainwreck continues --- src/TODO | 9 +++++++++ src/client/Client.cc | 6 +++--- src/include/ceph_fs.h | 6 +++++- src/include/types.h | 1 + src/mds/Locker.cc | 6 +++--- src/mds/MDCache.cc | 12 ++++++------ src/mds/MDCache.h | 22 +++++++++++----------- src/mds/Migrator.cc | 4 ++-- src/mds/Server.cc | 6 +++--- src/mds/mdstypes.h | 4 +++- src/messages/MClientFileCaps.h | 3 --- src/messages/MClientReconnect.h | 14 ++++++++++---- src/messages/MMDSCacheRejoin.h | 4 ++-- 13 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/TODO b/src/TODO index d1934bf29e8e1..f888e000f2da4 100644 --- a/src/TODO +++ b/src/TODO @@ -28,6 +28,15 @@ snaps on mds - client reconnect - esp cap claim +- client snap caps + - NO CAP STATE FOR SNAPPED INODES. + - mds grants open access (yes/no), but there is no state, since there is no concurrency. + (mds doesn't grant access until filelock it is readable, i.e., snapped data has flushed) + - client _should_ only send FLUSHSNAP _after_ data is flushed. this will require much more sophisticated barriers in the client's cache. + - reconnect should map caps into snaprealms, and include snaprealm state, such that those can be brought in sync w/ the mds. + - reconnect does _not_ need any per-cap snap-related info. + + /- call open_parents() where needed. - what about during recovery? e.g. client reconnected caps... - mds server ops diff --git a/src/client/Client.cc b/src/client/Client.cc index c7b8ac8044909..de01689b104c6 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1218,10 +1218,10 @@ void Client::send_reconnect(int mds) dout(10) << " path on " << p->first << " is " << path << dendl; in->caps[mds]->seq = 0; // reset seq. - m->add_cap(p->first, path.get_path(), // ino + m->add_cap(p->first.ino, path.get_path(), // ino in->caps_wanted(), // wanted in->caps[mds]->issued, // issued - in->inode.size, in->inode.mtime, in->inode.atime); + in->inode.size, in->inode.mtime, in->inode.atime, in->snaprealm->ino); } if (in->exporting_mds == mds) { dout(10) << " clearing exporting_caps on " << p->first << dendl; @@ -1441,7 +1441,7 @@ void Client::check_caps(Inode *in, bool flush_snap) op = CEPH_CAP_OP_RELEASE; dout(10) << " op = " << op << dendl; MClientFileCaps *m = new MClientFileCaps(op, - in->inode, in->snapid, + in->inode, 0, cap->seq, cap->issued, diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index b779835bb292d..4ba2c3ef5714e 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -808,7 +808,6 @@ inline static const char* ceph_cap_op_name(int op) { struct ceph_mds_file_caps { __le32 op; __le64 ino; - __le64 snapid; __le32 seq; __le32 caps, wanted; __le64 size, max_size; @@ -837,9 +836,14 @@ struct ceph_mds_cap_reconnect { __le32 issued; __le64 size; struct ceph_timespec mtime, atime; + __le64 snaprealm; } __attribute__ ((packed)); /* followed by encoded string */ +struct ceph_mds_snaprealm_reconnect { + __le64 seq; + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); /* * snaps diff --git a/src/include/types.h b/src/include/types.h index 59a7aee966d84..3a70c1a98f1ef 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -118,6 +118,7 @@ WRITE_RAW_ENCODER(ceph_mds_lease) WRITE_RAW_ENCODER(ceph_mds_reply_head) WRITE_RAW_ENCODER(ceph_mds_reply_inode) WRITE_RAW_ENCODER(ceph_mds_cap_reconnect) +WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect) WRITE_RAW_ENCODER(ceph_frag_tree_split) WRITE_RAW_ENCODER(ceph_inopath_item) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 2c50eb5bc42c4..15e0b9546661d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -619,7 +619,7 @@ bool Locker::issue_caps(CInode *in) << " new pending " << cap_string(cap->pending()) << " was " << cap_string(before) << dendl; mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT, - in->inode, in->last, + in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), cap->pending(), @@ -642,7 +642,7 @@ void Locker::issue_truncate(CInode *in) it++) { Capability *cap = it->second; mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_TRUNC, - in->inode, in->last, + in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), cap->pending(), @@ -902,7 +902,7 @@ void Locker::share_inode_max_size(CInode *in) if (cap->pending() & CEPH_CAP_WR) { dout(10) << "share_inode_max_size with client" << client << dendl; mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT, - in->inode, in->last, + in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), cap->pending(), diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 95e728caf3339..384862b4d71e6 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2717,7 +2717,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); // check cap exports - for (map >::iterator p = weak->cap_exports.begin(); + for (map >::iterator p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); @@ -2733,7 +2733,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) assert(mds->is_rejoin()); // check cap exports. - for (map >::iterator p = weak->cap_exports.begin(); + for (map >::iterator p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); @@ -2869,13 +2869,13 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) * returns a C_Gather* is there is work to do. caller is responsible for setting * the C_Gather completer. */ -C_Gather *MDCache::parallel_fetch(map& pathmap) +C_Gather *MDCache::parallel_fetch(map& pathmap) { dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl; // scan list set fetch_queue; - map::iterator p = pathmap.begin(); + map::iterator p = pathmap.begin(); while (p != pathmap.end()) { CInode *in = get_inode(p->first); if (in) { @@ -3434,7 +3434,7 @@ void MDCache::rejoin_gather_finish() // process cap imports // ino -> client -> frommds -> capex - for (map > >::iterator p = cap_imports.begin(); + for (map > >::iterator p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); @@ -3476,7 +3476,7 @@ void MDCache::rejoin_import_cap(CInode *in, int client, ceph_mds_cap_reconnect& // send IMPORT SnapRealm *realm = in->find_snaprealm(); MClientFileCaps *reap = new MClientFileCaps(CEPH_CAP_OP_IMPORT, - in->inode, in->last, + in->inode, realm->inode->ino(), cap->get_last_seq(), cap->pending(), diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index fe8c5a8842747..8b933d2c5d97b 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -617,11 +617,11 @@ protected: set rejoin_sent; // nodes i sent a rejoin to set rejoin_ack_gather; // nodes from whom i need a rejoin ack - map > cap_exports; // ino -> client -> capex - map cap_export_paths; + map > cap_exports; // ino -> client -> capex + map cap_export_paths; - map > > cap_imports; // ino -> client -> frommds -> capex - map cap_import_paths; + map > > cap_imports; // ino -> client -> frommds -> capex + map cap_import_paths; set rejoin_undef_inodes; set rejoin_potential_updated_scatterlocks; @@ -641,14 +641,14 @@ protected: public: void rejoin_gather_finish(); void rejoin_send_rejoins(); - void rejoin_export_caps(vinodeno_t vino, int client, cap_reconnect_t& icr) { - cap_exports[vino][client] = icr.capinfo; - cap_export_paths[vino] = icr.path; + void rejoin_export_caps(inodeno_t ino, int client, cap_reconnect_t& icr) { + cap_exports[ino][client] = icr.capinfo; + cap_export_paths[ino] = icr.path; } - void rejoin_recovered_caps(vinodeno_t vino, int client, cap_reconnect_t& icr, + void rejoin_recovered_caps(inodeno_t ino, int client, cap_reconnect_t& icr, int frommds=-1) { - cap_imports[vino][client][frommds] = icr.capinfo; - cap_import_paths[vino] = icr.path; + cap_imports[ino][client][frommds] = icr.capinfo; + cap_import_paths[ino] = icr.path; } void rejoin_import_cap(CInode *in, int client, ceph_mds_cap_reconnect& icr, int frommds); @@ -822,7 +822,7 @@ public: vector& anchortrace, Context *onfinish); - C_Gather *parallel_fetch(map& pathmap); + C_Gather *parallel_fetch(map& pathmap); void make_trace(vector& trace, CInode *in); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index f1fc8792dc1d0..2b48ebda5067d 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -896,7 +896,7 @@ void Migrator::finish_export_inode_caps(CInode *in) dout(7) << "finish_export_inode telling client" << it->first << " exported caps on " << *in << dendl; MClientFileCaps *m = new MClientFileCaps(CEPH_CAP_OP_EXPORT, - in->inode, in->last, + in->inode, in->find_snaprealm()->inode->ino(), cap->get_last_seq(), cap->pending(), @@ -2045,7 +2045,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from, SnapRealm *realm = in->find_snaprealm(); MClientFileCaps *caps = new MClientFileCaps(CEPH_CAP_OP_IMPORT, - in->inode, in->last, + in->inode, realm->inode->ino(), cap->get_last_seq(), cap->pending(), diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 13e2e6f53507e..d96fe02665727 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -376,7 +376,7 @@ void Server::handle_client_reconnect(MClientReconnect *m) } else { // caps - for (map::iterator p = m->caps.begin(); + for (map::iterator p = m->caps.begin(); p != m->caps.end(); ++p) { CInode *in = mdcache->get_inode(p->first); @@ -399,9 +399,9 @@ void Server::handle_client_reconnect(MClientReconnect *m) // mark client caps stale. inode_t fake_inode; memset(&fake_inode, 0, sizeof(fake_inode)); - fake_inode.ino = p->first.ino; + fake_inode.ino = p->first; MClientFileCaps *stale = new MClientFileCaps(CEPH_CAP_OP_EXPORT, - fake_inode, p->first.snapid, + fake_inode, 0, 0, 0, // doesn't matter. diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 24708e1667e31..1772291bd5416 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -541,13 +541,14 @@ struct cap_reconnect_t { ceph_mds_cap_reconnect capinfo; cap_reconnect_t() {} - cap_reconnect_t(const string& p, int w, int i, uint64_t sz, utime_t mt, utime_t at) : + cap_reconnect_t(const string& p, int w, int i, uint64_t sz, utime_t mt, utime_t at, inodeno_t sr) : path(p) { capinfo.wanted = w; capinfo.issued = i; capinfo.size = sz; capinfo.mtime = mt; capinfo.atime = at; + capinfo.snaprealm = sr; } void encode(bufferlist& bl) const { @@ -562,6 +563,7 @@ struct cap_reconnect_t { WRITE_CLASS_ENCODER(cap_reconnect_t) + // ================================================================ // dir frag diff --git a/src/messages/MClientFileCaps.h b/src/messages/MClientFileCaps.h index 25440c6cf28f5..043722c65f421 100644 --- a/src/messages/MClientFileCaps.h +++ b/src/messages/MClientFileCaps.h @@ -30,7 +30,6 @@ class MClientFileCaps : public Message { capseq_t get_mseq() { return h.migrate_seq; } inodeno_t get_ino() { return inodeno_t(h.ino); } - snapid_t get_snapid() { return snapid_t(h.snapid); } __u64 get_size() { return h.size; } __u64 get_max_size() { return h.max_size; } @@ -63,7 +62,6 @@ class MClientFileCaps : public Message { MClientFileCaps() {} MClientFileCaps(int op, inode_t& inode, - snapid_t snapid, inodeno_t realm, long seq, int caps, @@ -72,7 +70,6 @@ class MClientFileCaps : public Message { Message(CEPH_MSG_CLIENT_FILECAPS) { h.op = op; h.ino = inode.ino; - h.snapid = snapid; h.seq = seq; h.caps = caps; h.wanted = wanted; diff --git a/src/messages/MClientReconnect.h b/src/messages/MClientReconnect.h index 165dec0007b15..841535fe62a04 100644 --- a/src/messages/MClientReconnect.h +++ b/src/messages/MClientReconnect.h @@ -20,8 +20,9 @@ class MClientReconnect : public Message { public: - map caps; __u8 closed; // true if this session was closed by the client. + map caps; // only head inodes + map realms; MClientReconnect() : Message(CEPH_MSG_CLIENT_RECONNECT), closed(false) { } @@ -33,10 +34,15 @@ public: << caps.size() << " caps)"; } - void add_cap(vinodeno_t ino, const string& path, + void add_cap(inodeno_t ino, const string& path, int wanted, int issued, - loff_t sz, utime_t mt, utime_t at) { - caps[ino] = cap_reconnect_t(path, wanted, issued, sz, mt, at); + loff_t sz, utime_t mt, utime_t at, + inodeno_t sr) { + caps[ino] = cap_reconnect_t(path, wanted, issued, sz, mt, at, sr); + } + void add_snaprealm(inodeno_t ino, snapid_t seq, inodeno_t parent) { + realms[ino].seq = seq; + realms[ino].parent = parent; } void encode_payload() { diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index e937bc13c3205..3ee487a5321a0 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -142,8 +142,8 @@ class MMDSCacheRejoin : public Message { // open bufferlist cap_export_bl; - map > cap_exports; - map cap_export_paths; + map > cap_exports; + map cap_export_paths; // full bufferlist inode_base; -- 2.39.5