]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: synchronize snaptable caches when mds recovers
authorYan, Zheng <zyan@redhat.com>
Tue, 9 Jan 2018 11:23:25 +0000 (19:23 +0800)
committerYan, Zheng <zyan@redhat.com>
Fri, 9 Feb 2018 09:46:55 +0000 (17:46 +0800)
The basic idea is:

1. For recovering mds:
 Learn other mds' pending snaptable commits from resolve messages.
 Load snaptable cache from snapserver when resolve done.

2. For survivor mds:
  Refresh snaptable cache from snapserver when cluster is in resolving
  state.
  Learn recovering mds' pending snaptable commits from resolve messages.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/MDSTableClient.cc
src/mds/MDSTableClient.h
src/messages/MMDSResolve.h

index 772828d0454b3445e1236cfe132974f6dab16000..95ae7379e662b8ea5055bdefdf5d7fcb18b6fbc4 100644 (file)
@@ -2670,11 +2670,26 @@ void MDCache::resolve_start(MDSInternalContext *resolve_done_)
       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
   }
   resolve_gather = recovery_set;
+
+  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
 }
 
 void MDCache::send_resolves()
 {
   send_slave_resolves();
+
+  if (!resolve_done) {
+    // I'm survivor: refresh snap cache
+    mds->snapclient->sync(
+       new MDSInternalContextWrapper(mds,
+         new FunctionContext([this](int r) {
+           maybe_finish_slave_resolve();
+           })
+         )
+       );
+    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
+    return;
+  }
   if (!resolve_ack_gather.empty()) {
     dout(10) << "send_resolves still waiting for resolve ack from ("
             << resolve_ack_gather << ")" << dendl;
@@ -2685,6 +2700,7 @@ void MDCache::send_resolves()
             << resolve_need_rollback << ")" << dendl;
     return;
   }
+
   send_subtree_resolves();
 }
 
@@ -2848,6 +2864,11 @@ void MDCache::send_subtree_resolves()
        p != resolves.end();
        ++p) {
     MMDSResolve* m = p->second;
+    if (mds->is_resolve()) {
+      m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
+    } else {
+      m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
+    }
     m->subtrees = my_subtrees;
     m->ambiguous_imports = my_ambig_imports;
     dout(10) << "sending subtee resolve to mds." << p->first << dendl;
@@ -2858,7 +2879,9 @@ void MDCache::send_subtree_resolves()
 
 void MDCache::maybe_finish_slave_resolve() {
   if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
-    send_subtree_resolves();
+    // snap cache get synced or I'm in resolve state
+    if (mds->snapclient->is_synced() || resolve_done)
+      send_subtree_resolves();
     process_delayed_resolve();
   }
 }
@@ -3281,6 +3304,16 @@ void MDCache::handle_resolve(MMDSResolve *m)
     dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
     other_ambiguous_imports[from][pi->first].swap( pi->second );
   }
+
+  // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload
+  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
+  for (auto p : m->table_clients) {
+    dout(10) << " noting " << get_mdstable_name(p.type)
+            << " pending_commits " << p.pending_commits << dendl;
+    MDSTableClient *client = mds->get_table_client(p.type);
+    for (auto q : p.pending_commits)
+      client->notify_commit(q);
+  }
   
   // did i get them all?
   resolve_gather.erase(from);
@@ -3328,6 +3361,7 @@ void MDCache::maybe_resolve_finish()
     recalc_auth_bits(false);
     resolve_done.release()->complete(0);
   } else {
+    // I am survivor.
     maybe_send_pending_rejoins();
   }
 }
index 6e8a84ac74f918e791971247eed45e27386326b0..f3cc5bae718cf7f91d51b6579c5b85fb8260744f 100644 (file)
@@ -470,6 +470,7 @@ protected:
   bool resolves_pending;
   set<mds_rank_t> resolve_gather;      // nodes i need resolves from
   set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
+  set<version_t> resolve_snapclient_commits;
   map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
   map<mds_rank_t, MMDSResolve*> delayed_resolve;
   
index 61b901e2f7f5e221d915f26c89285ee3978f4463..b2277bb9b752cb29cc16764dbeeaa24de0f945a1 100644 (file)
@@ -53,6 +53,15 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
   dout(10) << "handle_request " << *m << dendl;
   assert(m->table == table);
 
+  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+    } else {
+      m->put();
+    }
+    return;
+  }
+
   version_t tid = m->get_tid();
   uint64_t reqid = m->reqid;
 
index ac4e66d072dd6735388a5f8e884861012282c973..3eb68c421ae0d64c1a474cedb412234131b44829 100644 (file)
@@ -80,6 +80,13 @@ public:
     ack_waiters[tid].push_back(c);
   }
 
+  set<version_t> get_journaled_tids() const {
+    set<version_t> tids;
+    for (auto p : pending_commit)
+      tids.insert(p.first);
+    return tids;
+  }
+
   void handle_mds_failure(mds_rank_t mds);
 
   // child must implement
index bf87e5f3a88708f78ee493729c8d12602db34f74..dc072c640b29ab2eeb1b84d45574c1dd7092c525 100644 (file)
@@ -39,10 +39,32 @@ public:
       decode(committing, bl);
     }
   };
-  WRITE_CLASS_ENCODER(slave_request)
 
   map<metareqid_t, slave_request> slave_requests;
 
+  // table client information
+  struct table_client {
+    __u8 type;
+    set<version_t> pending_commits;
+
+    table_client() : type(0) {}
+    table_client(int _type, const set<version_t>& commits)
+      : type(_type), pending_commits(commits) {}
+
+    void encode(bufferlist& bl) const {
+      using ceph::encode;
+      encode(type, bl);
+      encode(pending_commits, bl);
+    }
+    void decode(bufferlist::iterator& bl) {
+      using ceph::decode;
+      decode(type, bl);
+      decode(pending_commits, bl);
+    }
+  };
+
+  list<table_client> table_clients;
+
   MMDSResolve() : Message(MSG_MDS_RESOLVE) {}
 private:
   ~MMDSResolve() override {}
@@ -75,11 +97,16 @@ public:
     slave_requests[reqid].inode_caps.claim(bl);
   }
 
+  void add_table_commits(int table, const set<version_t>& pending_commits) {
+    table_clients.push_back(table_client(table, pending_commits));
+  }
+
   void encode_payload(uint64_t features) override {
     using ceph::encode;
     encode(subtrees, payload);
     encode(ambiguous_imports, payload);
     encode(slave_requests, payload);
+    encode(table_clients, payload);
   }
   void decode_payload() override {
     using ceph::decode;
@@ -87,6 +114,7 @@ public:
     decode(subtrees, p);
     decode(ambiguous_imports, p);
     decode(slave_requests, p);
+    decode(table_clients, p);
   }
 };
 
@@ -95,4 +123,5 @@ inline ostream& operator<<(ostream& out, const MMDSResolve::slave_request&) {
 }
 
 WRITE_CLASS_ENCODER(MMDSResolve::slave_request)
+WRITE_CLASS_ENCODER(MMDSResolve::table_client)
 #endif