From afabf82395361f5d2209a980578b4b6715d80f3d Mon Sep 17 00:00:00 2001 From: Matthew Oliver Date: Thu, 28 May 2020 11:59:19 +1000 Subject: [PATCH] cephadm: Detect stale and then recreate connections Currently we make and cache connections to nodes during a check_host. If a cached connection is disconnect from the other end the remoto connection object doesn't track this, so further checks to the host fail. I have pushed up a PR[0] to remoto to add a `has_connection` method to their `BaseConnection` class, which we now use in this patch to check to see if the connection is stale. If it is it is then recreated. There is some monkey patching happening so we can add the required `has_connection` to remoto in this patch which we can remove as soon as the other PR have landed and a new version of remoto is released. [0] https://github.com/alfredodeza/remoto/pull/56 Fixes: https://tracker.ceph.com/issues/45627 Fixes: https://tracker.ceph.com/issues/45032 Signed-off-by: Matthew Oliver --- src/pybind/mgr/cephadm/module.py | 20 ++++++++-- src/pybind/mgr/cephadm/tests/test_cephadm.py | 42 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 9db98b977b9..2362cd71b7e 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -43,6 +43,15 @@ from .upgrade import CEPH_UPGRADE_ORDER, CephadmUpgrade try: import remoto + # NOTE(mattoliverau) Patch remoto until remoto PR + # (https://github.com/alfredodeza/remoto/pull/56) lands + from distutils.version import StrictVersion + if StrictVersion(remoto.__version__) <= StrictVersion('1.2'): + def remoto_has_connection(self): + return self.gateway.hasreceiver() + + from remoto.backends import BaseConnection + BaseConnection.has_connection = remoto_has_connection import remoto.process import execnet.gateway_bootstrap except ImportError as e: @@ -816,10 +825,13 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): """ Setup a connection for running commands on remote host. """ - conn_and_r = self._cons.get(host) - if conn_and_r: - self.log.debug('Have connection to %s' % host) - return conn_and_r + conn, r = self._cons.get(host, (None, None)) + if conn: + if conn.has_connection(): + self.log.debug('Have connection to %s' % host) + return conn, r + else: + self._reset_con(host) n = self.ssh_user + '@' + host self.log.debug("Opening connection to {} with ssh options '{}'".format( n, self._ssh_options)) diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index cb9453f0367..07a9cdb538d 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -544,3 +544,45 @@ class TestCephadm(object): assert cephadm_module._check_host('test') is None out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json() assert out == HostSpec('test', 'test').to_json() + + def test_stale_connections(self, cephadm_module): + class Connection(object): + """ + A mocked connection class that only allows the use of the connection + once. If you attempt to use it again via a _check, it'll explode (go + boom!). + + The old code triggers the boom. The new code checks the has_connection + and will recreate the connection. + """ + fuse = False + + @staticmethod + def has_connection(): + return False + + def import_module(self, *args, **kargs): + return mock.Mock() + + @staticmethod + def exit(): + pass + + def _check(conn, *args, **kargs): + if conn.fuse: + raise Exception("boom: connection is dead") + else: + conn.fuse = True + return '{}', None, 0 + with mock.patch("remoto.Connection", side_effect=[Connection(), Connection(), Connection()]): + with mock.patch("remoto.process.check", _check): + with self._with_host(cephadm_module, 'test'): + code, out, err = cephadm_module.check_host('test') + # First should succeed. + assert err is None + + # On second it should attempt to reuse the connection, where the + # connection is "down" so will recreate the connection. The old + # code will blow up here triggering the BOOM! + code, out, err = cephadm_module.check_host('test') + assert err is None -- 2.39.5