]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: Detect stale and then recreate connections
authorMatthew Oliver <moliver@suse.com>
Thu, 28 May 2020 01:59:19 +0000 (11:59 +1000)
committerSebastian Wagner <sebastian.wagner@suse.com>
Tue, 2 Jun 2020 12:58:27 +0000 (14:58 +0200)
Currently we make and cache connections to nodes during a check_host.
If a cached connection is disconnect from the other end the remoto
connection object doesn't track this, so further checks to the host
fail.

I have pushed up a PR[0] to remoto to add a `has_connection` method to
their `BaseConnection` class, which we now use in this patch to check to
see if the connection is stale. If it is it is then recreated.

There is some monkey patching happening so we can add the required
`has_connection` to remoto in this patch which we can remove as soon as
the other PR have landed and a new version of remoto is released.

[0] https://github.com/alfredodeza/remoto/pull/56
Fixes: https://tracker.ceph.com/issues/45627
Fixes: https://tracker.ceph.com/issues/45032
Signed-off-by: Matthew Oliver <moliver@suse.com>
(cherry picked from commit afabf82395361f5d2209a980578b4b6715d80f3d)

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index 9db98b977b93d03a90a4ac9598278ba58b23bfa1..2362cd71b7ef70f143bafca9bac9e330582c61e8 100644 (file)
@@ -43,6 +43,15 @@ from .upgrade import CEPH_UPGRADE_ORDER, CephadmUpgrade
 
 try:
     import remoto
+    # NOTE(mattoliverau) Patch remoto until remoto PR
+    # (https://github.com/alfredodeza/remoto/pull/56) lands
+    from distutils.version import StrictVersion
+    if StrictVersion(remoto.__version__) <= StrictVersion('1.2'):
+        def remoto_has_connection(self):
+            return self.gateway.hasreceiver()
+
+        from remoto.backends import BaseConnection
+        BaseConnection.has_connection = remoto_has_connection
     import remoto.process
     import execnet.gateway_bootstrap
 except ImportError as e:
@@ -816,10 +825,13 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
         """
         Setup a connection for running commands on remote host.
         """
-        conn_and_r = self._cons.get(host)
-        if conn_and_r:
-            self.log.debug('Have connection to %s' % host)
-            return conn_and_r
+        conn, r = self._cons.get(host, (None, None))
+        if conn:
+            if conn.has_connection():
+                self.log.debug('Have connection to %s' % host)
+                return conn, r
+            else:
+                self._reset_con(host)
         n = self.ssh_user + '@' + host
         self.log.debug("Opening connection to {} with ssh options '{}'".format(
             n, self._ssh_options))
index cb9453f0367082c14e69007051cbcd6c088c2150..07a9cdb538d849a64ae5b90f79dafcc3653fdef1 100644 (file)
@@ -544,3 +544,45 @@ class TestCephadm(object):
             assert cephadm_module._check_host('test') is None
             out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
             assert out == HostSpec('test', 'test').to_json()
+
+    def test_stale_connections(self, cephadm_module):
+        class Connection(object):
+            """
+            A mocked connection class that only allows the use of the connection
+            once. If you attempt to use it again via a _check, it'll explode (go
+            boom!).
+
+            The old code triggers the boom. The new code checks the has_connection
+            and will recreate the connection.
+            """
+            fuse = False
+
+            @staticmethod
+            def has_connection():
+                return False
+
+            def import_module(self, *args, **kargs):
+                return mock.Mock()
+
+            @staticmethod
+            def exit():
+                pass
+
+        def _check(conn, *args, **kargs):
+            if conn.fuse:
+                raise Exception("boom: connection is dead")
+            else:
+                conn.fuse = True
+            return '{}', None, 0
+        with mock.patch("remoto.Connection", side_effect=[Connection(), Connection(), Connection()]):
+            with mock.patch("remoto.process.check", _check):
+                with self._with_host(cephadm_module, 'test'):
+                    code, out, err = cephadm_module.check_host('test')
+                    # First should succeed.
+                    assert err is None
+
+                    # On second it should attempt to reuse the connection, where the
+                    # connection is "down" so will recreate the connection. The old
+                    # code will blow up here triggering the BOOM!
+                    code, out, err = cephadm_module.check_host('test')
+                    assert err is None