From 101ae53a6c333058cd8f6f4d169d29457133b446 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 3 Nov 2021 14:27:02 +0800 Subject: [PATCH] mds: just respawn mds daemon when osd op requests timeout Fixes: https://tracker.ceph.com/issues/51280 Signed-off-by: Xiubo Li --- src/mds/MDSContext.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc index 94726a2cb4f45..f96fc54202a78 100644 --- a/src/mds/MDSContext.cc +++ b/src/mds/MDSContext.cc @@ -107,8 +107,11 @@ void MDSIOContextBase::complete(int r) { return; } - if (r == -EBLACKLISTED) { - derr << "MDSIOContextBase: blacklisted! Restarting..." << dendl; + // It's possible that the osd op requests will get stuck and then time out + // after "rados_osd_op_timeout"; the mds won't know what to do about it, so + // just respawn it. + if (r == -EBLACKLISTED || r == -ETIMEDOUT) { + derr << "MDSIOContextBase: failed with " << r << ", restarting..." << dendl; mds->respawn(); } else { MDSContext::complete(r); -- 2.39.5