From 23fc86d10bd629018a7921cd6c3beac27cb23ec2 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 24 Jun 2021 14:41:10 +0800 Subject: [PATCH] mds: just respawn mds daemon when osd op requests timeout Fixes: https://tracker.ceph.com/issues/51280 Signed-off-by: Xiubo Li (cherry picked from commit c854a4eea44a631079dfe481c235a323fae54b74) --- src/mds/MDSContext.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc index 8c04586f212e..fcf6f764fe4c 100644 --- a/src/mds/MDSContext.cc +++ b/src/mds/MDSContext.cc @@ -107,8 +107,11 @@ void MDSIOContextBase::complete(int r) { return; } - if (r == -CEPHFS_EBLOCKLISTED) { - derr << "MDSIOContextBase: blocklisted! Restarting..." << dendl; + // It's possible that the osd op requests will get stuck and then time out + // after "rados_osd_op_timeout"; the mds won't know what to do about it, so + // just respawn it. + if (r == -CEPHFS_EBLOCKLISTED || r == -CEPHFS_ETIMEDOUT) { + derr << "MDSIOContextBase: failed with " << r << ", restarting..." << dendl; mds->respawn(); } else { MDSContext::complete(r); -- 2.47.3