mgr/cephadm: convert networks from set to list + don't reset con on down agent hosts
author Adam King <adking@redhat.com>
Mon, 30 Aug 2021 17:39:56 +0000 (13:39 -0400)
committer Adam King <adking@redhat.com>
Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
the networks info needs to be a list so it can be encoded in a JSON string
don't reset the con on hosts where the agent isn't reporting (possibly offline hosts)

Signed-off-by: Adam King <adking@redhat.com>
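
The networks change exists because json.dumps cannot encode Python sets, and part of the list_networks() output comes back as a set of addresses per interface. Below is a minimal sketch of the failure and of a set-to-list conversion, using made-up sample data shaped roughly like the {subnet: {interface: addresses}} structure; the patch itself performs the conversion with an explicit loop inside the agent's reporting loop rather than a comprehension.

    import json

    # hypothetical sample, shaped like list_networks() output:
    # {subnet: {interface: set of addresses}}
    networks = {'10.0.0.0/24': {'eth0': {'10.0.0.5', '10.0.0.6'}}}

    try:
        json.dumps(networks)
    except TypeError as e:
        # raises "Object of type set is not JSON serializable"
        print(f'set cannot be encoded: {e}')

    # converting each address set to a list makes the structure serializable
    networks_list = {
        subnet: {iface: list(addrs) for iface, addrs in ifaces.items()}
        for subnet, ifaces in networks.items()
    }
    print(json.dumps(networks_list))
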
src/cephadm/cephadm
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/fixtures.py

index b7af2573c51e6d7552bf34c3013fe31bb877c85f..6aa0bd0cd335134bfd0adb3c81315ab5786b0ada 100755 (executable)
@@ -40,7 +40,7 @@ from configparser import ConfigParser
 from functools import wraps
 from glob import glob
 from io import StringIO
-from threading import Thread, RLock
+from threading import Thread, RLock, Event
 from urllib.error import HTTPError
 from urllib.request import urlopen, Request
 from pathlib import Path
@@ -3538,7 +3538,7 @@ class CephadmAgent():
         self.listener_key_path = os.path.join(self.daemon_dir, 'listener.key')
         self.listener_port = ''
         self.ack = -1
-        self.event = threading.Event()
+        self.event = Event()
         self.mgr_listener = MgrListener(self)
         self.device_enhanced_scan = False
 
@@ -3651,16 +3651,23 @@ WantedBy=ceph-{fsid}.target
 
         while not self.stop:
             ack = self.ack
-
             try:
                 volume = self._ceph_volume(self.device_enhanced_scan)
             except Exception as e:
                 logger.error(f'Failed to get ceph-volume metadata: {e}')
                 volume = ''
 
+            # part of the networks info is returned as a set which is not JSON
+            # serializable. The set must be converted to a list
+            networks = list_networks(self.ctx)
+            networks_list = {}
+            for key in networks.keys():
+                for k, v in networks[key].items():
+                    networks_list[key] = {k: list(v)}
+
             data = json.dumps({'host': self.host,
                               'ls': list_daemons(self.ctx),
-                               'networks': list_networks(self.ctx),
+                               'networks': networks_list,
                                'facts': HostFacts(self.ctx).dump(),
                                'volume': volume,
                                'ack': str(ack),
index abc56a1984c7ddd3f2b6f882334b9213a3e3ca5c..cdb42d4bc7293bbe6f526dccc9fc9de19c8864dd 100644 (file)
@@ -171,7 +171,6 @@ class HostData:
             if 'facts' in data and data['facts']:
                 self.mgr.cache.update_host_facts(host, json.loads(data['facts']))
             if 'volume' in data and data['volume']:
-                self.mgr.log.error(data['volume'])
                 ret = Devices.from_json(json.loads(data['volume']))
                 self.mgr.cache.update_host_devices(host, ret.devices)
 
index 7b1b432e431d3c0e77df9fbc8b3554e050326183..e11a0e7e8e2fab74b74be88d60d59d99ca65d2e9 100644 (file)
@@ -3,7 +3,7 @@ import json
 import logging
 import uuid
 from collections import defaultdict
-from typing import TYPE_CHECKING, Optional, List, cast, Dict, Any, Union, Tuple, Iterator, Set
+from typing import TYPE_CHECKING, Optional, List, cast, Dict, Any, Union, Tuple, Set
 
 from ceph.deployment import inventory
 from ceph.deployment.drive_group import DriveGroupSpec
@@ -269,7 +269,10 @@ class CephadmServe:
                 if host in self.mgr.offline_hosts:
                     return
                 self.mgr.offline_hosts.add(host)
-                self.mgr._reset_con(host)
+                # In case host is actually offline, it's best to reset the connection to avoid
+                # a long timeout trying to use an existing connection to an offline host
+                # REVISIT AFTER https://github.com/ceph/ceph/pull/42919
+                # self.mgr.ssh._reset_con(host)
                 agents_down.append(host)
                 # try to schedule redeploy of agent in case it is individually down
                 try:
@@ -280,7 +283,7 @@ class CephadmServe:
                     self.log.debug(
                         f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}')
                 return
-            else:
+            elif self.mgr.use_agent:
                 self.mgr.offline_hosts_remove(host)
 
             if self.mgr.cache.host_needs_check(host):
index bc0fdeeba244e76cf21768eb7f1db4d23dba105a..7fe868392b4dedea12ab3b9f5023ae232353da28 100644 (file)
@@ -49,6 +49,7 @@ class MockEventLoopThread:
             loop.close()
             asyncio.set_event_loop(None)
 
+
 def receive_agent_metadata(m: CephadmOrchestrator, host: str, ops: List[str] = None) -> None:
     to_update: Dict[str, Callable[[str, Any], None]] = {
         'ls': m._process_ls_output,
@@ -69,6 +70,7 @@ def receive_agent_metadata_all_hosts(m: CephadmOrchestrator) -> None:
     for host in m.cache.get_hosts():
         receive_agent_metadata(m, host)
 
+
 @contextmanager
 def with_cephadm_module(module_options=None, store=None):
     """