From: Xiubo Li Date: Wed, 3 Nov 2021 06:27:02 +0000 (+0800) Subject: mds: just respawn mds daemon when osd op requests timeout X-Git-Tag: v15.2.16~37^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=101ae53a6c333058cd8f6f4d169d29457133b446;p=ceph.git mds: just respawn mds daemon when osd op requests timeout Fixes: https://tracker.ceph.com/issues/51280 Signed-off-by: Xiubo Li --- diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc index 94726a2cb4f..f96fc54202a 100644 --- a/src/mds/MDSContext.cc +++ b/src/mds/MDSContext.cc @@ -107,8 +107,11 @@ void MDSIOContextBase::complete(int r) { return; } - if (r == -EBLACKLISTED) { - derr << "MDSIOContextBase: blacklisted! Restarting..." << dendl; + // It's possible that the osd op requests will get stuck and then time out + // after "rados_osd_op_timeout"; the mds won't know what it should do, so just + // respawn it. + if (r == -EBLACKLISTED || r == -ETIMEDOUT) { + derr << "MDSIOContextBase: failed with " << r << ", restarting..." << dendl; mds->respawn(); } else { MDSContext::complete(r);