From e0a52e03e949c54482899f4a507a0870d25e0f27 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 28 Aug 2018 06:35:18 -0400 Subject: [PATCH] client: retry remount on dcache invalidation failure For some (unknown) reason, there have been reports of ceph-fuse crashing due to a failure in remounting at the time of invalidating the kernel dentry cache. This issue is also not reproducible yet. Therefore, as suggested by Patrick and Zheng, as a temporary workaround, the client would ignore the failure as the invalidation would be retried. There is a max cap on the number of consecutive remount failures after which the client would abort. Fixes: http://tracker.ceph.com/issues/35931 Signed-off-by: Venky Shankar (cherry picked from commit d1471f070cd1ad9c0f773e00d2552161d1ad5955) Conflicts: src/client/Client.cc src/client/Client.h src/common/options.cc --- src/client/Client.cc | 18 ++++++++++++------ src/client/Client.h | 5 ++++- src/common/options.cc | 3 +++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index aacc68f1fd338..20009ea055a1f 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4049,11 +4049,15 @@ void Client::remove_session_caps(MetaSession *s) sync_cond.Signal(); } -int Client::_do_remount(void) +int Client::_do_remount(bool retry_on_error) { + uint64_t max_retries = cct->_conf->get_val("mds_max_retries_on_remount_failure"); + errno = 0; int r = remount_cb(callback_handle); - if (r != 0) { + if (r == 0) { + retries_on_invalidate = 0; + } else { int e = errno; client_t whoami = get_nodeid(); if (r == -1) { @@ -4065,8 +4069,10 @@ int Client::_do_remount(void) "failed to remount (to trim kernel dentries): " "return code = " << r << dendl; } - bool should_abort = cct->_conf->get_val("client_die_on_failed_remount") || - cct->_conf->get_val("client_die_on_failed_dentry_invalidate"); + bool should_abort = + (cct->_conf->get_val("client_die_on_failed_remount") || + 
cct->_conf->get_val("client_die_on_failed_dentry_invalidate")) && + !(retry_on_error && (++retries_on_invalidate < max_retries)); if (should_abort && !unmounting) { lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl; ceph_abort(); @@ -4082,7 +4088,7 @@ public: explicit C_Client_Remount(Client *c) : client(c) {} void finish(int r) override { assert(r == 0); - client->_do_remount(); + client->_do_remount(true); } }; @@ -10113,7 +10119,7 @@ int Client::test_dentry_handling(bool can_invalidate) r = 0; } else if (remount_cb) { ldout(cct, 1) << "using remount_cb" << dendl; - r = _do_remount(); + r = _do_remount(false); } if (r) { bool should_abort = cct->_conf->get_val("client_die_on_failed_dentry_invalidate"); diff --git a/src/client/Client.h b/src/client/Client.h index 2616f6d716919..99707483821b0 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -764,7 +764,7 @@ private: int _release_fh(Fh *fh); void _put_fh(Fh *fh); - int _do_remount(void); + int _do_remount(bool retry_on_error); friend class C_Client_Remount; struct C_Readahead : public Context { @@ -1253,6 +1253,9 @@ public: uint32_t get_deleg_timeout() { return deleg_timeout; } int set_deleg_timeout(uint32_t timeout); int ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv); + +private: + uint64_t retries_on_invalidate = 0; }; /** diff --git a/src/common/options.cc b/src/common/options.cc index c45f900154f78..c6b421dc9a7e1 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -6493,6 +6493,9 @@ std::vector