In a multi-MDS cluster, a recovering MDS may receive the mdsmap that changes
its state later than the other MDSs do. Furthermore, the recovering MDS may
receive messages triggered by its state change from other MDSs before it
receives the corresponding mdsmap.
Fixes: http://tracker.ceph.com/issues/37594
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
(cherry picked from commit d3a444473abc98e5ce8121af24538a141a292777)
Conflicts:
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDSRank.h
void Locker::handle_inode_file_caps(MInodeFileCaps *m)
{
// nobody should be talking to us during recovery.
- assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+ if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ assert(!"got unexpected message during recovery");
+ }
// ok
CInode *in = mdcache->get_inode(m->get_ino());
mds_rank_t from = mds_rank_t(strong->get_source().num());
// only a recovering node will get a strong rejoin.
- assert(mds->is_rejoin());
+ if (!mds->is_rejoin()) {
+ if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+ mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+ return;
+ }
+ assert(!"got unexpected rejoin message during recovery");
+ }
// assimilate any potentially dirty scatterlock state
for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
{
dout(1) << "rejoin_start" << dendl;
mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+ finish_contexts(g_ceph_context, waiting_for_rejoin);
}
void MDSRank::rejoin_done()
{
ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
- list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+ list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+ waiting_for_reconnect, waiting_for_resolve;
list<MDSInternalContextBase*> waiting_for_any_client_connection;
list<MDSInternalContextBase*> replay_queue;
map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
void wait_for_replay(MDSInternalContextBase *c) {
waiting_for_replay.push_back(c);
}
+ void wait_for_rejoin(MDSInternalContextBase *c) {
+ waiting_for_rejoin.push_back(c);
+ }
void wait_for_reconnect(MDSInternalContextBase *c) {
waiting_for_reconnect.push_back(c);
}