From 03d94383136d9042a41adcf721c7831fcca33d0f Mon Sep 17 00:00:00 2001
From: Adam King
Date: Thu, 24 Mar 2022 23:21:47 -0400
Subject: [PATCH] mgr/cephadm: add keep-alive requests to ssh connections

Fixes: https://tracker.ceph.com/issues/51733
Signed-off-by: Adam King
---
 src/pybind/mgr/cephadm/module.py |  6 ++----
 src/pybind/mgr/cephadm/serve.py  | 10 +++++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index d8801dba8438c..09514db422f29 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -687,10 +687,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             ssh_options += ['-i', tkey.name]
 
         self._temp_files = temp_files
-        if ssh_options:
-            self._ssh_options = ' '.join(ssh_options)  # type: Optional[str]
-        else:
-            self._ssh_options = None
+        ssh_options += ['-o', 'ServerAliveInterval=7', '-o', 'ServerAliveCountMax=3']
+        self._ssh_options = ' '.join(ssh_options)  # type: Optional[str]
 
         if self.mode == 'root':
             self.ssh_user = self.get_store('ssh_user', default='root')
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 939b50bd30b9a..9112fdbd97940 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -1265,7 +1265,15 @@ class CephadmServe:
                 if stdin:
                     self.log.debug('stdin: %s' % stdin)
 
-                python = connr.choose_python()
+                try:
+                    # if host has gone offline this is likely where we'll fail first
+                    python = connr.choose_python()
+                except RuntimeError as e:
+                    self.mgr.offline_hosts.add(host)
+                    self.mgr._reset_con(host)
+                    if error_ok:
+                        return [], [str(e)], 1
+                    raise
                 if not python:
                     raise RuntimeError(
                         'unable to find python on %s (tried %s in %s)' % (
-- 
2.47.3