git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: the reconnect trainwreck continues
author Sage Weil <sage@newdream.net>
Thu, 31 Jul 2008 23:49:55 +0000 (16:49 -0700)
committer Sage Weil <sage@newdream.net>
Thu, 31 Jul 2008 23:49:55 +0000 (16:49 -0700)
13 files changed:
src/TODO
src/client/Client.cc
src/include/ceph_fs.h
src/include/types.h
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/Migrator.cc
src/mds/Server.cc
src/mds/mdstypes.h
src/messages/MClientFileCaps.h
src/messages/MClientReconnect.h
src/messages/MMDSCacheRejoin.h

index d1934bf29e8e108fcf31c42959e8f99aa47f5aaf..f888e000f2da4837efc77706dd40c5b5fb742528 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -28,6 +28,15 @@ snaps on mds
 - client reconnect
   - esp cap claim
 
+- client snap caps
+  - NO CAP STATE FOR SNAPPED INODES.  
+  - mds grants open access (yes/no), but there is no state, since there is no concurrency.
+    (mds doesn't grant access until the filelock is readable, i.e., snapped data has been flushed)
+  - client _should_ only send FLUSHSNAP _after_ data is flushed.  this will require much more sophisticated barriers in the client's cache.
+  - reconnect should map caps into snaprealms, and include snaprealm state, such that those can be brought in sync w/ the mds.
+  - reconnect does _not_ need any per-cap snap-related info.
+
+
 /- call open_parents() where needed.
   - what about during recovery?  e.g. client reconnected caps...
 - mds server ops
index c7b8ac80449093ef822c941d15ace2dd83ee773b..de01689b104c610436da48790b95e8939fa0ab1a 100644 (file)
@@ -1218,10 +1218,10 @@ void Client::send_reconnect(int mds)
        dout(10) << " path on " << p->first << " is " << path << dendl;
 
        in->caps[mds]->seq = 0;  // reset seq.
-       m->add_cap(p->first, path.get_path(),   // ino
+       m->add_cap(p->first.ino, path.get_path(),   // ino
                   in->caps_wanted(), // wanted
                   in->caps[mds]->issued,     // issued
-                  in->inode.size, in->inode.mtime, in->inode.atime);
+                  in->inode.size, in->inode.mtime, in->inode.atime, in->snaprealm->ino);
       }
       if (in->exporting_mds == mds) {
        dout(10) << " clearing exporting_caps on " << p->first << dendl;
@@ -1441,7 +1441,7 @@ void Client::check_caps(Inode *in, bool flush_snap)
       op = CEPH_CAP_OP_RELEASE;
     dout(10) << "  op = " << op << dendl;
     MClientFileCaps *m = new MClientFileCaps(op,
-                                            in->inode, in->snapid,
+                                            in->inode,
                                             0,
                                              cap->seq,
                                              cap->issued,
index b779835bb292d118c8d17f63c50809ecd21ab507..4ba2c3ef5714eeb4edc926c7bcee55a710117107 100644 (file)
@@ -808,7 +808,6 @@ inline static const char* ceph_cap_op_name(int op) {
 struct ceph_mds_file_caps {
        __le32 op;
        __le64 ino;
-       __le64 snapid;
        __le32 seq;
        __le32 caps, wanted;
        __le64 size, max_size;
@@ -837,9 +836,14 @@ struct ceph_mds_cap_reconnect {
        __le32 issued;
        __le64 size;
        struct ceph_timespec mtime, atime;
+       __le64 snaprealm;
 } __attribute__ ((packed));
 /* followed by encoded string */
 
+struct ceph_mds_snaprealm_reconnect {
+       __le64 seq;
+       __le64 parent;  /* parent realm */
+} __attribute__ ((packed));
 
 /*
  * snaps
index 59a7aee966d84eefc999181d73019226e40a0979..3a70c1a98f1ef4b0d3c0521f0b6af12d42c37e67 100644 (file)
@@ -118,6 +118,7 @@ WRITE_RAW_ENCODER(ceph_mds_lease)
 WRITE_RAW_ENCODER(ceph_mds_reply_head)
 WRITE_RAW_ENCODER(ceph_mds_reply_inode)
 WRITE_RAW_ENCODER(ceph_mds_cap_reconnect)
+WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect)
 WRITE_RAW_ENCODER(ceph_frag_tree_split)
 WRITE_RAW_ENCODER(ceph_inopath_item)
 
index 2c50eb5bc42c4155a9175ae323cc9610b6f471de..15e0b9546661d5aa7842bff2bd7e6968fc9b2db3 100644 (file)
@@ -619,7 +619,7 @@ bool Locker::issue_caps(CInode *in)
                << " new pending " << cap_string(cap->pending()) << " was " << cap_string(before) 
                << dendl;
         mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
-                                                    in->inode, in->last,
+                                                    in->inode,
                                                     in->find_snaprealm()->inode->ino(),
                                                     cap->get_last_seq(),
                                                     cap->pending(),
@@ -642,7 +642,7 @@ void Locker::issue_truncate(CInode *in)
        it++) {
     Capability *cap = it->second;
     mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_TRUNC,
-                                                in->inode, in->last,
+                                                in->inode,
                                                 in->find_snaprealm()->inode->ino(),
                                                 cap->get_last_seq(),
                                                 cap->pending(),
@@ -902,7 +902,7 @@ void Locker::share_inode_max_size(CInode *in)
     if (cap->pending() & CEPH_CAP_WR) {
       dout(10) << "share_inode_max_size with client" << client << dendl;
       mds->send_message_client(new MClientFileCaps(CEPH_CAP_OP_GRANT,
-                                                  in->inode, in->last,
+                                                  in->inode,
                                                   in->find_snaprealm()->inode->ino(),
                                                   cap->get_last_seq(),
                                                   cap->pending(),
index 95e728caf3339aa6d223196154385e9f3a9cbef8..384862b4d71e629c044d1166b0900ff10225479d 100644 (file)
@@ -2717,7 +2717,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
 
     // check cap exports
-    for (map<vinodeno_t,map<int,ceph_mds_cap_reconnect> >::iterator p = weak->cap_exports.begin();
+    for (map<inodeno_t,map<int,ceph_mds_cap_reconnect> >::iterator p = weak->cap_exports.begin();
         p != weak->cap_exports.end();
         ++p) {
       CInode *in = get_inode(p->first);
@@ -2733,7 +2733,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     assert(mds->is_rejoin());
 
     // check cap exports.
-    for (map<vinodeno_t,map<int,ceph_mds_cap_reconnect> >::iterator p = weak->cap_exports.begin();
+    for (map<inodeno_t,map<int,ceph_mds_cap_reconnect> >::iterator p = weak->cap_exports.begin();
         p != weak->cap_exports.end();
         ++p) {
       CInode *in = get_inode(p->first);
@@ -2869,13 +2869,13 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
  * returns a C_Gather* if there is work to do.  caller is responsible for setting
  * the C_Gather completer.
  */
-C_Gather *MDCache::parallel_fetch(map<vinodeno_t,string>& pathmap)
+C_Gather *MDCache::parallel_fetch(map<inodeno_t,string>& pathmap)
 {
   dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl;
 
   // scan list
   set<CDir*> fetch_queue;
-  map<vinodeno_t,string>::iterator p = pathmap.begin();
+  map<inodeno_t,string>::iterator p = pathmap.begin();
   while (p != pathmap.end()) {
     CInode *in = get_inode(p->first);
     if (in) {
@@ -3434,7 +3434,7 @@ void MDCache::rejoin_gather_finish()
   
   // process cap imports
   //  ino -> client -> frommds -> capex
-  for (map<vinodeno_t,map<int, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
+  for (map<inodeno_t,map<int, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
        p != cap_imports.end();
        ++p) {
     CInode *in = get_inode(p->first);
@@ -3476,7 +3476,7 @@ void MDCache::rejoin_import_cap(CInode *in, int client, ceph_mds_cap_reconnect&
   // send IMPORT
   SnapRealm *realm = in->find_snaprealm();
   MClientFileCaps *reap = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
-                                             in->inode, in->last,
+                                             in->inode,
                                              realm->inode->ino(),
                                              cap->get_last_seq(),
                                              cap->pending(),
index fe8c5a88427470ad99b9b067bf2dbae0301f0a5b..8b933d2c5d97b0bae946353826a883165fdceab2 100644 (file)
@@ -617,11 +617,11 @@ protected:
   set<int> rejoin_sent;        // nodes i sent a rejoin to
   set<int> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
 
-  map<vinodeno_t,map<int,ceph_mds_cap_reconnect> > cap_exports; // ino -> client -> capex
-  map<vinodeno_t,string> cap_export_paths;
+  map<inodeno_t,map<int,ceph_mds_cap_reconnect> > cap_exports; // ino -> client -> capex
+  map<inodeno_t,string> cap_export_paths;
 
-  map<vinodeno_t,map<int,map<int,ceph_mds_cap_reconnect> > > cap_imports;  // ino -> client -> frommds -> capex
-  map<vinodeno_t,string> cap_import_paths;
+  map<inodeno_t,map<int,map<int,ceph_mds_cap_reconnect> > > cap_imports;  // ino -> client -> frommds -> capex
+  map<inodeno_t,string> cap_import_paths;
   
   set<CInode*> rejoin_undef_inodes;
   set<CInode*> rejoin_potential_updated_scatterlocks;
@@ -641,14 +641,14 @@ protected:
 public:
   void rejoin_gather_finish();
   void rejoin_send_rejoins();
-  void rejoin_export_caps(vinodeno_t vino, int client, cap_reconnect_t& icr) {
-    cap_exports[vino][client] = icr.capinfo;
-    cap_export_paths[vino] = icr.path;
+  void rejoin_export_caps(inodeno_t ino, int client, cap_reconnect_t& icr) {
+    cap_exports[ino][client] = icr.capinfo;
+    cap_export_paths[ino] = icr.path;
   }
-  void rejoin_recovered_caps(vinodeno_t vino, int client, cap_reconnect_t& icr, 
+  void rejoin_recovered_caps(inodeno_t ino, int client, cap_reconnect_t& icr, 
                             int frommds=-1) {
-    cap_imports[vino][client][frommds] = icr.capinfo;
-    cap_import_paths[vino] = icr.path;
+    cap_imports[ino][client][frommds] = icr.capinfo;
+    cap_import_paths[ino] = icr.path;
   }
   void rejoin_import_cap(CInode *in, int client, ceph_mds_cap_reconnect& icr, int frommds);
 
@@ -822,7 +822,7 @@ public:
                          vector<Anchor>& anchortrace,
                          Context *onfinish);
 
-  C_Gather *parallel_fetch(map<vinodeno_t,string>& pathmap);
+  C_Gather *parallel_fetch(map<inodeno_t,string>& pathmap);
 
   void make_trace(vector<CDentry*>& trace, CInode *in);
   
index f1fc8792dc1d01f3c426bec8678d5c2ddc995568..2b48ebda5067d55db361c39733ffcd987d046563 100644 (file)
@@ -896,7 +896,7 @@ void Migrator::finish_export_inode_caps(CInode *in)
     dout(7) << "finish_export_inode telling client" << it->first
            << " exported caps on " << *in << dendl;
     MClientFileCaps *m = new MClientFileCaps(CEPH_CAP_OP_EXPORT,
-                                            in->inode, in->last,
+                                            in->inode,
                                             in->find_snaprealm()->inode->ino(),
                                              cap->get_last_seq(), 
                                              cap->pending(),
@@ -2045,7 +2045,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from,
 
     SnapRealm *realm = in->find_snaprealm();
     MClientFileCaps *caps = new MClientFileCaps(CEPH_CAP_OP_IMPORT,
-                                               in->inode, in->last,
+                                               in->inode,
                                                realm->inode->ino(),
                                                cap->get_last_seq(),
                                                cap->pending(),
index 13e2e6f53507ebd3b064dfca243cd822f27199ae..d96fe0266572749a50e4b89430d42f8d3c462482 100644 (file)
@@ -376,7 +376,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
   } else {
     
     // caps
-    for (map<vinodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
+    for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
         p != m->caps.end();
         ++p) {
       CInode *in = mdcache->get_inode(p->first);
@@ -399,9 +399,9 @@ void Server::handle_client_reconnect(MClientReconnect *m)
        // mark client caps stale.
        inode_t fake_inode;
        memset(&fake_inode, 0, sizeof(fake_inode));
-       fake_inode.ino = p->first.ino;
+       fake_inode.ino = p->first;
        MClientFileCaps *stale = new MClientFileCaps(CEPH_CAP_OP_EXPORT,
-                                                    fake_inode, p->first.snapid,
+                                                    fake_inode,
                                                     0,
                                                     0,
                                                     0,                // doesn't matter.
index 24708e1667e3133b25cc371cec35958297d624a5..1772291bd5416012559d3d609b57faef7e955920 100644 (file)
@@ -541,13 +541,14 @@ struct cap_reconnect_t {
   ceph_mds_cap_reconnect capinfo;
 
   cap_reconnect_t() {}
-  cap_reconnect_t(const string& p, int w, int i, uint64_t sz, utime_t mt, utime_t at) : 
+  cap_reconnect_t(const string& p, int w, int i, uint64_t sz, utime_t mt, utime_t at, inodeno_t sr) : 
     path(p) {
     capinfo.wanted = w;
     capinfo.issued = i;
     capinfo.size = sz;
     capinfo.mtime = mt;
     capinfo.atime = at;
+    capinfo.snaprealm = sr;
   }
 
   void encode(bufferlist& bl) const {
@@ -562,6 +563,7 @@ struct cap_reconnect_t {
 WRITE_CLASS_ENCODER(cap_reconnect_t)
 
 
+
 // ================================================================
 // dir frag
 
index 25440c6cf28f527130e192369961841b672f353a..043722c65f42129025aa2873326b304ebedbbcf4 100644 (file)
@@ -30,7 +30,6 @@ class MClientFileCaps : public Message {
   capseq_t get_mseq() { return h.migrate_seq; }
 
   inodeno_t get_ino() { return inodeno_t(h.ino); }
-  snapid_t get_snapid() { return snapid_t(h.snapid); }
 
   __u64 get_size() { return h.size;  }
   __u64 get_max_size() { return h.max_size;  }
@@ -63,7 +62,6 @@ class MClientFileCaps : public Message {
   MClientFileCaps() {}
   MClientFileCaps(int op,
                  inode_t& inode,
-                 snapid_t snapid,
                  inodeno_t realm,
                   long seq,
                   int caps,
@@ -72,7 +70,6 @@ class MClientFileCaps : public Message {
     Message(CEPH_MSG_CLIENT_FILECAPS) {
     h.op = op;
     h.ino = inode.ino;
-    h.snapid = snapid;
     h.seq = seq;
     h.caps = caps;
     h.wanted = wanted;
index 165dec0007b156be4bf821d3a4ddc58ce9e689bd..841535fe62a04056e0070351a343b15a975bdee6 100644 (file)
@@ -20,8 +20,9 @@
 
 class MClientReconnect : public Message {
 public:
-  map<vinodeno_t, cap_reconnect_t>  caps;
   __u8 closed;  // true if this session was closed by the client.
+  map<inodeno_t, cap_reconnect_t>  caps;   // only head inodes
+  map<inodeno_t, ceph_mds_snaprealm_reconnect> realms;
 
   MClientReconnect() : Message(CEPH_MSG_CLIENT_RECONNECT),
                       closed(false) { }
@@ -33,10 +34,15 @@ public:
        << caps.size() << " caps)";
   }
 
-  void add_cap(vinodeno_t ino, const string& path,
+  void add_cap(inodeno_t ino, const string& path,
               int wanted, int issued,
-              loff_t sz, utime_t mt, utime_t at) {
-    caps[ino] = cap_reconnect_t(path, wanted, issued, sz, mt, at);
+              loff_t sz, utime_t mt, utime_t at,
+              inodeno_t sr) {
+    caps[ino] = cap_reconnect_t(path, wanted, issued, sz, mt, at, sr);
+  }
+  void add_snaprealm(inodeno_t ino, snapid_t seq, inodeno_t parent) {
+    realms[ino].seq = seq;
+    realms[ino].parent = parent;
   }
 
   void encode_payload() {
index e937bc13c3205c9ea26ce53e79ccb4592241ca6a..3ee487a5321a08edf6412a27f40b79fc03187cfe 100644 (file)
@@ -142,8 +142,8 @@ class MMDSCacheRejoin : public Message {
 
   // open
   bufferlist cap_export_bl;
-  map<vinodeno_t,map<__s32, ceph_mds_cap_reconnect> > cap_exports;
-  map<vinodeno_t,string> cap_export_paths;
+  map<inodeno_t,map<__s32, ceph_mds_cap_reconnect> > cap_exports;
+  map<inodeno_t,string> cap_export_paths;
 
   // full
   bufferlist inode_base;