From 3b5da9c613906a66db03550aa58ce4c7c27a73f0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 9 Jan 2018 19:23:25 +0800 Subject: [PATCH] mds: synchronize snaptable caches when mds recovers The basic idea is: 1. For recovering mds: Learn other mds' pending snaptable commits from resolve messages. Load snaptable cache from snapserver when resolve done. 2. For survivor mds: Refresh snaptable cache from snapserver when cluster is in resolving state. Learn recovering mds' pending snaptable commits from resolve messages. Signed-off-by: "Yan, Zheng" --- src/mds/MDCache.cc | 36 +++++++++++++++++++++++++++++++++++- src/mds/MDCache.h | 1 + src/mds/MDSTableClient.cc | 9 +++++++++ src/mds/MDSTableClient.h | 7 +++++++ src/messages/MMDSResolve.h | 31 ++++++++++++++++++++++++++++++- 5 files changed, 82 insertions(+), 2 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 772828d0454b3..95ae7379e662b 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2670,11 +2670,26 @@ void MDCache::resolve_start(MDSInternalContext *resolve_done_) adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } resolve_gather = recovery_set; + + resolve_snapclient_commits = mds->snapclient->get_journaled_tids(); } void MDCache::send_resolves() { send_slave_resolves(); + + if (!resolve_done) { + // I'm survivor: refresh snap cache + mds->snapclient->sync( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + maybe_finish_slave_resolve(); + }) + ) + ); + dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl; + return; + } if (!resolve_ack_gather.empty()) { dout(10) << "send_resolves still waiting for resolve ack from (" << resolve_ack_gather << ")" << dendl; @@ -2685,6 +2700,7 @@ void MDCache::send_resolves() << resolve_need_rollback << ")" << dendl; return; } + send_subtree_resolves(); } @@ -2848,6 +2864,11 @@ void MDCache::send_subtree_resolves() p != resolves.end(); ++p) { MMDSResolve* m = p->second; + if (mds->is_resolve()) { + m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits); + } else { + m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids()); + } m->subtrees = my_subtrees; m->ambiguous_imports = my_ambig_imports; dout(10) << "sending subtee resolve to mds." << p->first << dendl; @@ -2858,7 +2879,9 @@ void MDCache::send_subtree_resolves() void MDCache::maybe_finish_slave_resolve() { if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) { - send_subtree_resolves(); + // snap cache get synced or I'm in resolve state + if (mds->snapclient->is_synced() || resolve_done) + send_subtree_resolves(); process_delayed_resolve(); } } @@ -3281,6 +3304,16 @@ void MDCache::handle_resolve(MMDSResolve *m) dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl; other_ambiguous_imports[from][pi->first].swap( pi->second ); } + + // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload + // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds + for (auto p : m->table_clients) { + dout(10) << " noting " << get_mdstable_name(p.type) + << " pending_commits " << p.pending_commits << dendl; + MDSTableClient *client = mds->get_table_client(p.type); + for (auto q : p.pending_commits) + client->notify_commit(q); + } // did i get them all? resolve_gather.erase(from); @@ -3328,6 +3361,7 @@ void MDCache::maybe_resolve_finish() recalc_auth_bits(false); resolve_done.release()->complete(0); } else { + // I am survivor. maybe_send_pending_rejoins(); } } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 6e8a84ac74f91..f3cc5bae718cf 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -470,6 +470,7 @@ protected: bool resolves_pending; set resolve_gather; // nodes i need resolves from set resolve_ack_gather; // nodes i need a resolve_ack from + set resolve_snapclient_commits; map resolve_need_rollback; // rollbacks i'm writing to the journal map delayed_resolve; diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc index 61b901e2f7f5e..b2277bb9b752c 100644 --- a/src/mds/MDSTableClient.cc +++ b/src/mds/MDSTableClient.cc @@ -53,6 +53,15 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m) dout(10) << "handle_request " << *m << dendl; assert(m->table == table); + if (mds->get_state() < MDSMap::STATE_RESOLVE) { + if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) { + mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m)); + } else { + m->put(); + } + return; + } + version_t tid = m->get_tid(); uint64_t reqid = m->reqid; diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h index ac4e66d072dd6..3eb68c421ae0d 100644 --- a/src/mds/MDSTableClient.h +++ b/src/mds/MDSTableClient.h @@ -80,6 +80,13 @@ public: ack_waiters[tid].push_back(c); } + set get_journaled_tids() const { + set tids; + for (auto p : pending_commit) + tids.insert(p.first); + return tids; + } + void handle_mds_failure(mds_rank_t mds); // child must implement diff --git a/src/messages/MMDSResolve.h b/src/messages/MMDSResolve.h index bf87e5f3a8870..dc072c640b29a 100644 --- a/src/messages/MMDSResolve.h +++ b/src/messages/MMDSResolve.h @@ -39,10 +39,32 @@ public: decode(committing, bl); } }; - WRITE_CLASS_ENCODER(slave_request) map slave_requests; + // table client information + struct table_client { + __u8 type; + set pending_commits; + + table_client() : type(0) {} + table_client(int _type, const set& commits) + : type(_type), pending_commits(commits) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(type, bl); + encode(pending_commits, bl); + } + void decode(bufferlist::iterator& bl) { + using ceph::decode; + decode(type, bl); + decode(pending_commits, bl); + } + }; + + list table_clients; + MMDSResolve() : Message(MSG_MDS_RESOLVE) {} private: ~MMDSResolve() override {} @@ -75,11 +97,16 @@ public: slave_requests[reqid].inode_caps.claim(bl); } + void add_table_commits(int table, const set& pending_commits) { + table_clients.push_back(table_client(table, pending_commits)); + } + void encode_payload(uint64_t features) override { using ceph::encode; encode(subtrees, payload); encode(ambiguous_imports, payload); encode(slave_requests, payload); + encode(table_clients, payload); } void decode_payload() override { using ceph::decode; @@ -87,6 +114,7 @@ public: decode(subtrees, p); decode(ambiguous_imports, p); decode(slave_requests, p); + decode(table_clients, p); } }; @@ -95,4 +123,5 @@ inline ostream& operator<<(ostream& out, const MMDSResolve::slave_request&) { } WRITE_CLASS_ENCODER(MMDSResolve::slave_request) +WRITE_CLASS_ENCODER(MMDSResolve::table_client) #endif -- 2.39.5