From: Venky Shankar Date: Tue, 28 Aug 2018 10:35:18 +0000 (-0400) Subject: client: retry remount on dcache invalidation failure X-Git-Tag: v13.2.5~144^2~3^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2b9f5ccf523e44fed77ecab4b1147b91c7d0d245;p=ceph.git client: retry remount on dcache invalidation failure For some (unknown) reason, there have been reports of ceph-fuse crashing due to a failure in remounting at the time of invalidating the kernel dentry cache. This issue is also not reproducible yet. Therefore, as suggested by Patrick and Zheng, as a temporary workaround, the client would ignore the failure, as the invalidation would be retried. There is a cap on the number of consecutive remount failures after which the client would abort. Signed-off-by: Venky Shankar (cherry picked from commit d1471f070cd1ad9c0f773e00d2552161d1ad5955) Conflicts: src/client/Client.cc src/client/Client.h src/common/options.cc --- diff --git a/src/client/Client.cc b/src/client/Client.cc index 1380eb5646ee..3ef1c4b2d595 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4047,11 +4047,15 @@ void Client::remove_session_caps(MetaSession *s) sync_cond.Signal(); } -int Client::_do_remount(void) +int Client::_do_remount(bool retry_on_error) { + uint64_t max_retries = cct->_conf->get_val("mds_max_retries_on_remount_failure"); + errno = 0; int r = remount_cb(callback_handle); - if (r != 0) { + if (r == 0) { + retries_on_invalidate = 0; + } else { int e = errno; client_t whoami = get_nodeid(); if (r == -1) { @@ -4063,8 +4067,10 @@ int Client::_do_remount(void) "failed to remount (to trim kernel dentries): " "return code = " << r << dendl; } - bool should_abort = cct->_conf->get_val("client_die_on_failed_remount") || - cct->_conf->get_val("client_die_on_failed_dentry_invalidate"); + bool should_abort = + (cct->_conf->get_val("client_die_on_failed_remount") || + cct->_conf->get_val("client_die_on_failed_dentry_invalidate")) && + !(retry_on_error && 
(++retries_on_invalidate < max_retries)); if (should_abort && !unmounting) { lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl; ceph_abort(); @@ -4080,7 +4086,7 @@ public: explicit C_Client_Remount(Client *c) : client(c) {} void finish(int r) override { assert(r == 0); - client->_do_remount(); + client->_do_remount(true); } }; @@ -10116,7 +10122,7 @@ int Client::test_dentry_handling(bool can_invalidate) r = 0; } else if (remount_cb) { ldout(cct, 1) << "using remount_cb" << dendl; - r = _do_remount(); + r = _do_remount(false); } if (r) { bool should_abort = cct->_conf->get_val("client_die_on_failed_dentry_invalidate"); diff --git a/src/client/Client.h b/src/client/Client.h index 6350242fab04..9c75a1c4590c 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -762,7 +762,7 @@ private: int _release_fh(Fh *fh); void _put_fh(Fh *fh); - int _do_remount(void); + int _do_remount(bool retry_on_error); struct C_Readahead : public Context { Client *client; @@ -1255,6 +1255,9 @@ public: uint32_t get_deleg_timeout() { return deleg_timeout; } int set_deleg_timeout(uint32_t timeout); int ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv); + +private: + uint64_t retries_on_invalidate = 0; }; /** diff --git a/src/common/options.cc b/src/common/options.cc index d58ecb168b22..98dad74817e9 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -7031,6 +7031,10 @@ std::vector