From cd41295f53f7aed647e3a9904d762e3453c564e1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 6 Mar 2009 16:25:04 -0800
Subject: [PATCH] mds: incorporate reconnect cap info after requests replay if
 ino originally missing

If the client reasserts caps from replayed requests, the inodes don't
yet exist during the reconnect (or even rejoin) stage.  So, if we don't
find the inode, keep the reconnect info around.  When processing a
replayed request in CInode::encode_inodestat, set wanted/issued
appropriately.

This is incomplete.  We really need something to ensure we deal with
replayed requests before new requests are handled, and on a
cluster-wide basis, since requests may involve slave requests to other
MDSs.  We should also clean up reconnects left unclaimed after all
replays are complete, and somehow inform the client when the cap is
officially nonexistent so the open file can be treated as EBADF.
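For context, the replay marker travels in the existing retry_attempt
field of the request header.  A client replaying a request it has
already journaled would flag it roughly as follows (illustrative sketch
only; "req" is a stand-in for the client's request structure, and the
client-side change is not part of this patch):

    // client side, when re-sending a request after an MDS failure
    // (sketch; not part of this patch)
    req->head.retry_attempt = CEPH_MDS_REQUEST_REPLAY;

The MDS then recognizes such requests via the MClientRequest::is_replay()
helper added below.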
---
 src/TODO                      |  2 +-
 src/include/ceph_fs.h         |  4 ++-
 src/mds/CInode.cc             | 15 ++++++++-
 src/mds/CInode.h              |  2 +-
 src/mds/MDCache.cc            | 62 ++++++++++++++++++++++-------------
 src/mds/MDCache.h             | 17 +++++++++-
 src/mds/Server.cc             | 11 ++++---
 src/mds/Server.h              |  2 +-
 src/messages/MClientRequest.h |  4 ++-
 9 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/src/TODO b/src/TODO
index d743e5af34a26..87c1d4d4c33b6 100644
--- a/src/TODO
+++ b/src/TODO
@@ -48,7 +48,6 @@ repair
 
 kernel client
-- flush unsafe mds requests when closing mds sessions
 - optional or no fill_trace?
 - flock, fnctl locks
 - async xattrs
@@ -95,6 +94,7 @@ userspace client
 - fix readdir vs fragment race by keeping a separate frag pos, and ignoring dentries below it
 
 mds
+- take some care with replayed client requests vs new requests
 - linkage vs cdentry replicas and remote rename....
 - move root inode into stray dir
 - make recovery work with early replies
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index e5cd3b4a5433d..060d8d05dda5e 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -780,11 +780,13 @@ union ceph_mds_request_args {
         } __attribute__ ((packed)) setlayout;
 } __attribute__ ((packed));
 
+#define CEPH_MDS_REQUEST_REPLAY  0xffff
+
 struct ceph_mds_request_head {
         ceph_tid_t tid, oldest_client_tid;
         ceph_epoch_t mdsmap_epoch; /* on client */
         __le32 num_fwd;
-        __le32 retry_attempt;
+        __le32 retry_attempt;  /* REQUEST_REPLAY if replay */
         __le64 mds_wants_replica_in_dirino;
         __le32 op;
         __le32 caller_uid, caller_gid;
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 60a766377247e..d848812a84dcf 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1343,7 +1343,7 @@ void CInode::decode_snap_blob(bufferlist& snapbl)
 
 bool CInode::encode_inodestat(bufferlist& bl, Session *session,
-                              snapid_t snapid)
+                              snapid_t snapid, bool is_replay)
 {
   int client = session->inst.name.num();
 
@@ -1442,6 +1442,19 @@ bool CInode::encode_inodestat(bufferlist& bl, Session *session,
       cap = add_client_cap(client, session, &mdcache->client_rdcaps, find_snaprealm());
     }
+    if (is_replay) {
+      // if this is a replayed request, check for a cap reconnect
+      ceph_mds_cap_reconnect *rc = mdcache->get_replay_cap_reconnect(pi->ino, client);
+      if (rc) {
+        // we should only have the cap reconnect for ONE client, and from ourselves.
+        dout(10) << " incorporating cap reconnect wanted " << ccap_string(rc->wanted)
+                 << " issue " << ccap_string(rc->issued) << " on " << *this << dendl;
+        cap->set_wanted(rc->wanted);
+        cap->issue(rc->issued);
+        mdcache->remove_replay_cap_reconnect(pi->ino, client);
+      }
+    }
+
     // if we're a directory, maybe bump filelock to loner?
     if (inode.is_dir() && is_auth() &&
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 2ada3a01e198f..5033a37b8ca8b 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -439,7 +439,7 @@ private:
 
   // for giving to clients
-  bool encode_inodestat(bufferlist& bl, Session *session, snapid_t snapid=CEPH_NOSNAP);
+  bool encode_inodestat(bufferlist& bl, Session *session, snapid_t snapid=CEPH_NOSNAP, bool is_replay=false);
   void encode_cap_message(MClientCaps *m, Capability *cap);
 
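The lookup consumed in encode_inodestat() above relies on the shape of
the cap_imports bookkeeping declared in MDCache.h further down:
ino -> client -> from-mds -> capinfo, where a from-mds key of -1
appears to mark a reconnect received directly from the client rather
than a cap imported from a peer MDS (hence the count(-1) probes in
get_replay_cap_reconnect()).  A self-contained sketch of that lookup,
with placeholder types standing in for the Ceph ones:

    #include <cstdio>
    #include <map>

    // placeholder for ceph_mds_cap_reconnect (illustrative only)
    struct cap_reconnect_sketch { unsigned wanted, issued; };

    // ino -> client -> from-mds -> capinfo
    typedef std::map<long, std::map<int, std::map<int, cap_reconnect_sketch> > > cap_imports_t;

    // mirrors MDCache::get_replay_cap_reconnect(): probe the from-mds == -1 slot
    cap_reconnect_sketch *find_self_reconnect(cap_imports_t &imports, long ino, int client) {
      if (imports.count(ino) &&
          imports[ino].count(client) &&
          imports[ino][client].count(-1))      // -1: came from the client itself
        return &imports[ino][client][-1];
      return NULL;
    }

    int main() {
      cap_imports_t imports;
      cap_reconnect_sketch rc = { 0x155, 0x5 };  // arbitrary wanted/issued bits
      imports[0x1000][4200][-1] = rc;            // ino 0x1000, client 4200, self
      cap_reconnect_sketch *p = find_self_reconnect(imports, 0x1000, 4200);
      std::printf("found=%d wanted=0x%x\n", p != NULL, p ? p->wanted : 0u);
      return 0;
    }

remove_replay_cap_reconnect() then drops the whole ino entry once a
replayed request has claimed it.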
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index fd6e620da8e29..fbd4ca6bc54a8 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2879,7 +2879,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
  * returns a C_Gather* if there is work to do.  caller is responsible for setting
  * the C_Gather completer.
  */
-C_Gather *MDCache::parallel_fetch(map<inodeno_t,filepath>& pathmap)
+C_Gather *MDCache::parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing)
 {
   dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl;
 
@@ -2898,8 +2898,16 @@ C_Gather *MDCache::parallel_fetch(map<inodeno_t,filepath>& pathmap)
     dout(17) << " missing " << p->first << " at " << p->second << dendl;
     CDir *dir = path_traverse_to_dir(p->second);
     assert(dir);
-    fetch_queue.insert(dir);
-    p++;
+    if (!dir->is_complete()) {
+      fetch_queue.insert(dir);
+      p++;
+    } else {
+      // probably because the client created it and held a cap but it never committed
+      // to the journal, and the op hasn't replayed yet.
+      dout(5) << " dne (not created yet?) " << p->first << " at " << p->second << dendl;
+      missing.insert(p->first);
+      pathmap.erase(p++);
+    }
   }
 
   if (pathmap.empty()) {
@@ -3441,20 +3449,43 @@ void MDCache::rejoin_gather_finish()
   // do this before ack, since some inodes we may have already gotten
   // from surviving MDSs.
   if (!cap_import_paths.empty()) {
-    C_Gather *gather = parallel_fetch(cap_import_paths);
+    C_Gather *gather = parallel_fetch(cap_import_paths, cap_imports_missing);
     if (gather) {
       gather->set_finisher(new C_MDC_RejoinGatherFinish(this));
       return;
     }
   }
 
+  process_imported_caps();
+  process_reconnected_caps();
+  identify_files_to_recover();
+
+  rejoin_send_acks();
+
+  // did we already get our acks too?
+  // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange.
+  if (rejoin_ack_gather.empty()) {
+    mds->rejoin_done();
+
+    // finally, kickstart past snap parent opens
+    open_snap_parents();
+  }
+}
+
+void MDCache::process_imported_caps()
+{
   // process cap imports
   //  ino -> client -> frommds -> capex
-  for (map<inodeno_t,map<int, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
-       p != cap_imports.end();
-       ++p) {
+  map<inodeno_t,map<int, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
+  while (p != cap_imports.end()) {
     CInode *in = get_inode(p->first);
-    assert(in);
+    if (!in) {
+      dout(10) << "process_imported_caps still missing " << p->first
+               << ", will try again after replayed client requests"
+               << dendl;
+      p++;
+      continue;
+    }
     for (map<int, map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
          q != p->second.end();
          ++q)
@@ -3465,20 +3496,7 @@ void MDCache::rejoin_gather_finish()
         if (r->first >= 0)
           rejoin_import_cap(in, q->first, r->second, r->first);
       }
-  }
-
-  process_reconnected_caps();
-  identify_files_to_recover();
-
-  rejoin_send_acks();
-
-  // did we already get our acks too?
-  // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange.
-  if (rejoin_ack_gather.empty()) {
-    mds->rejoin_done();
-
-    // finally, kickstart past snap parent opens
-    open_snap_parents();
+    cap_imports.erase(p++);  // remove and move on
   }
 }
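Both the parallel_fetch() and process_imported_caps() changes above
prune map entries mid-iteration.  With the pre-C++11 associative
containers used here, erase(p++) is the idiom that keeps this safe: the
post-increment hands the doomed position to erase() after p has already
moved on, and std::map::erase only invalidates iterators to the erased
element.  A minimal, runnable illustration (placeholder data, not Ceph
code):

    #include <cstdio>
    #include <map>

    int main() {
      std::map<int, char> m;
      m[1] = 'k';  // keep
      m[2] = 'd';  // drop
      m[3] = 'd';  // drop

      std::map<int, char>::iterator p = m.begin();
      while (p != m.end()) {
        if (p->second == 'd')
          m.erase(p++);  // advance first, then erase the old position
        else
          ++p;           // nothing to remove at this entry
      }
      std::printf("%d entries left\n", (int)m.size());  // prints: 1 entries left
      return 0;
    }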
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index b09a4054f7930..fccc2e75e4e94 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -640,6 +640,7 @@ protected:
   map<inodeno_t,map<int, map<int,ceph_mds_cap_reconnect> > > cap_imports;  // ino -> client -> frommds -> capex
   map<inodeno_t,filepath> cap_import_paths;
+  set<inodeno_t> cap_imports_missing;
 
   set<CInode*> rejoin_undef_inodes;
   set<CInode*> rejoin_potential_updated_scatterlocks;
@@ -668,6 +669,19 @@ public:
     cap_imports[ino][client][frommds] = icr.capinfo;
     cap_import_paths[ino] = filepath(icr.path, (__u64)icr.capinfo.pathbase);
   }
+  ceph_mds_cap_reconnect *get_replay_cap_reconnect(inodeno_t ino, int client) {
+    if (cap_imports.count(ino) &&
+        cap_imports[ino].count(client) &&
+        cap_imports[ino][client].count(-1)) {
+      return &cap_imports[ino][client][-1];
+    }
+    return NULL;
+  }
+  void remove_replay_cap_reconnect(inodeno_t ino, int client) {
+    assert(cap_imports[ino].size() == 1);
+    assert(cap_imports[ino][client].size() == 1);
+    cap_imports.erase(ino);
+  }
 
   // [reconnect/rejoin caps]
   map<inodeno_t,map<int,inodeno_t> > reconnected_caps;  // inode -> client -> realmino
@@ -679,6 +693,7 @@ public:
   void add_reconnected_snaprealm(int client, inodeno_t ino, snapid_t seq) {
     reconnected_snaprealms[ino][client] = seq;
   }
+  void process_imported_caps();
   void process_reconnected_caps();
   void prepare_realm_split(SnapRealm *realm, int client, inodeno_t ino,
                            map<int,MClientSnap*>& splits);
@@ -868,7 +883,7 @@ public:
   void open_remote_dentry(CDentry *dn, bool projected, Context *fin);
   void _open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin);
 
-  C_Gather *parallel_fetch(map<inodeno_t,filepath>& pathmap);
+  C_Gather *parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
 
   void make_trace(vector<CDentry*>& trace, CInode *in);
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index d0d789c1d2510..c7953c7f1bdb5 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -570,7 +570,8 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn)
   snapid_t snapid = CEPH_NOSNAP;
   CInode *snapdiri = 0;
   if (tracei || tracedn)
-    set_trace_dist(mdr->session, reply, tracei, tracedn, snapid, snapdiri, mdr);
+    set_trace_dist(mdr->session, reply, tracei, tracedn, snapid, snapdiri, mdr,
+                   mdr->client_request->is_replay());
 
   messenger->send_message(reply, client_inst);
 
@@ -636,6 +637,8 @@ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei,
   // give any preallocated inos to the session
   apply_allocated_inos(mdr);
 
+  bool is_replay = mdr->client_request->is_replay();
+
   // clean up request, drop locks, etc.
   // do this before replying, so that we can issue leases
   Session *session = mdr->session;
@@ -652,7 +655,7 @@ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei,
   // send reply, with trace, and possible leases
   if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
       (tracei || tracedn))
-    set_trace_dist(session, reply, tracei, tracedn, snapid, snapdiri, mdr);
+    set_trace_dist(session, reply, tracei, tracedn, snapid, snapdiri, mdr, is_replay);
 
   messenger->send_message(reply, client_inst);
 }
@@ -698,7 +701,7 @@ void Server::encode_null_lease(bufferlist& bl)
  */
 void Server::set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn,
                             snapid_t snapid, CInode *snapdiri,
-                            MDRequest *mdr)
+                            MDRequest *mdr, bool is_replay)
 {
   // inode, dentry, dir, ..., inode
   bufferlist bl;
@@ -731,7 +734,7 @@ void Server::set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn,
       dout(10) << "set_trace_dist snaprealm " << *realm << dendl;
     }
 
-    in->encode_inodestat(bl, session, snapid);
+    in->encode_inodestat(bl, session, snapid, is_replay);
     dout(20) << "set_trace_dist added snapid " << snapid << " " << *in << dendl;
 
     if (snapid != CEPH_NOSNAP && in == snapdiri) {
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 77b71d0746dc7..7f7715ff83f7f 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -90,7 +90,7 @@ public:
   void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei = 0, CDentry *tracedn = 0);
   void set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn,
                       snapid_t snapid, CInode *snapdiri,
-                      MDRequest *mdr = 0);
+                      MDRequest *mdr, bool is_replay);
   void encode_empty_dirstat(bufferlist& bl);
   void encode_infinite_lease(bufferlist& bl);
 
diff --git a/src/messages/MClientRequest.h b/src/messages/MClientRequest.h
index 20945435215ef..58e39851b4de6 100644
--- a/src/messages/MClientRequest.h
+++ b/src/messages/MClientRequest.h
@@ -100,7 +100,9 @@ public:
     return head.op & CEPH_MDS_OP_FOLLOW_LINK;
   }
 
-
+  bool is_replay() {
+    return head.retry_attempt == CEPH_MDS_REQUEST_REPLAY;
+  }
 
   // normal fields
   void set_tid(tid_t t) { head.tid = t; }
-- 
2.39.5