From: Venky Shankar Date: Wed, 2 Aug 2023 05:39:00 +0000 (-0400) Subject: cephfs-mirror: restart failed/blocklisted replayer instances X-Git-Tag: v18.2.1~229^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=f54125677d9c8490f8d9104ca6e153cbe3187666;p=ceph-ci.git cephfs-mirror: restart failed/blocklisted replayer instances This was buggy right from the start. Start maintaining per replayer blocklisted or failed timestamp and use that to check if a replayer restart is required. Signed-off-by: Venky Shankar (cherry picked from commit f2fb84ddd25ac66ca1e79339801913b35b597e83) --- diff --git a/src/tools/cephfs_mirror/FSMirror.h b/src/tools/cephfs_mirror/FSMirror.h index d2c9c07782a..a9c1fab1025 100644 --- a/src/tools/cephfs_mirror/FSMirror.h +++ b/src/tools/cephfs_mirror/FSMirror.h @@ -52,11 +52,35 @@ public: m_mirror_watcher->is_failed(); } + utime_t get_failed_ts() { + std::scoped_lock locker(m_lock); + if (m_instance_watcher) { + return m_instance_watcher->get_failed_ts(); + } + if (m_mirror_watcher) { + return m_mirror_watcher->get_failed_ts(); + } + + return utime_t(); + } + bool is_blocklisted() { std::scoped_lock locker(m_lock); return is_blocklisted(locker); } + utime_t get_blocklisted_ts() { + std::scoped_lock locker(m_lock); + if (m_instance_watcher) { + return m_instance_watcher->get_blocklisted_ts(); + } + if (m_mirror_watcher) { + return m_mirror_watcher->get_blocklisted_ts(); + } + + return utime_t(); + } + Peers get_peers() { std::scoped_lock locker(m_lock); return m_all_peers; diff --git a/src/tools/cephfs_mirror/InstanceWatcher.cc b/src/tools/cephfs_mirror/InstanceWatcher.cc index aeb474274fb..b6a51a141aa 100644 --- a/src/tools/cephfs_mirror/InstanceWatcher.cc +++ b/src/tools/cephfs_mirror/InstanceWatcher.cc @@ -116,12 +116,15 @@ void InstanceWatcher::handle_rewatch_complete(int r) { dout(0) << ": client blocklisted" < &args, m_monc(monc), m_msgr(msgr), m_listener(this), - m_last_blocklist_check(ceph_clock_now()), - m_last_failure_check(ceph_clock_now()), m_local(new librados::Rados()) { auto thread_pool = &(cct->lookup_or_create_singleton_object( "cephfs::mirror::thread_pool", false, cct)); @@ -498,51 +496,35 @@ void Mirror::update_fs_mirrors() { auto now = ceph_clock_now(); double blocklist_interval = g_ceph_context->_conf.get_val ("cephfs_mirror_restart_mirror_on_blocklist_interval").count(); - bool check_blocklist = blocklist_interval > 0 && ((now - m_last_blocklist_check) >= blocklist_interval); - double failed_interval = g_ceph_context->_conf.get_val ("cephfs_mirror_restart_mirror_on_failure_interval").count(); - bool check_failure = failed_interval > 0 && ((now - m_last_failure_check) >= failed_interval); { std::scoped_lock locker(m_lock); for (auto &[filesystem, mirror_action] : m_mirror_actions) { - auto failed = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed(); - auto blocklisted = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted(); - - if (check_failure && !mirror_action.action_in_progress && - !_is_restarting(filesystem) && failed) { - // about to restart failed mirror instance -- nothing - // should interfere - dout(5) << ": filesystem=" << filesystem << " failed mirroring -- restarting" << dendl; - _set_restarting(filesystem); - auto peers = mirror_action.fs_mirror->get_peers(); - auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers); - ctx->complete(0); - } else if (check_blocklist && !mirror_action.action_in_progress && - !_is_restarting(filesystem) && blocklisted) { - // about to restart blocklisted mirror instance -- nothing - // should interfere - _set_restarting(filesystem); - dout(5) << ": filesystem=" << filesystem << " is blocklisted -- restarting" << dendl; - auto peers = mirror_action.fs_mirror->get_peers(); - auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers); - ctx->complete(0); + auto failed_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed() && + (failed_interval > 0 && (mirror_action.fs_mirror->get_failed_ts() - now) > failed_interval); + auto blocklisted_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted() && + (blocklist_interval > 0 && (mirror_action.fs_mirror->get_blocklisted_ts() - now) > blocklist_interval); + + if (!mirror_action.action_in_progress && !_is_restarting(filesystem)) { + if (failed_restart || blocklisted_restart) { + dout(5) << ": filesystem=" << filesystem << " failed mirroring (failed: " + << failed_restart << ", blocklisted: " << blocklisted_restart << dendl; + _set_restarting(filesystem); + auto peers = mirror_action.fs_mirror->get_peers(); + auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers); + ctx->complete(0); + } } - if (!failed && !blocklisted && !mirror_action.action_ctxs.empty() + + if (!failed_restart && !blocklisted_restart && !mirror_action.action_ctxs.empty() && !mirror_action.action_in_progress) { auto ctx = std::move(mirror_action.action_ctxs.front()); mirror_action.action_ctxs.pop_front(); ctx->complete(0); } } - - if (check_blocklist) { - m_last_blocklist_check = now; - } - if (check_failure) { - m_last_failure_check = now; - } } schedule_mirror_update_task(); diff --git a/src/tools/cephfs_mirror/Mirror.h b/src/tools/cephfs_mirror/Mirror.h index 74fe5f4d8d3..2081b5b5305 100644 --- a/src/tools/cephfs_mirror/Mirror.h +++ b/src/tools/cephfs_mirror/Mirror.h @@ -101,9 +101,6 @@ private: std::unique_ptr m_cluster_watcher; std::map m_mirror_actions; - utime_t m_last_blocklist_check; - utime_t m_last_failure_check; - RadosRef m_local; std::unique_ptr m_service_daemon; diff --git a/src/tools/cephfs_mirror/MirrorWatcher.cc b/src/tools/cephfs_mirror/MirrorWatcher.cc index 26b88d077e5..b3770d103ea 100644 --- a/src/tools/cephfs_mirror/MirrorWatcher.cc +++ b/src/tools/cephfs_mirror/MirrorWatcher.cc @@ -93,12 +93,15 @@ void MirrorWatcher::handle_rewatch_complete(int r) { dout(0) << ": client blocklisted" <