]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: adding check logic
authorPaul Cuzner <pcuzner@redhat.com>
Wed, 10 Feb 2021 22:30:05 +0000 (11:30 +1300)
committerSage Weil <sage@newdream.net>
Tue, 16 Mar 2021 12:56:17 +0000 (07:56 -0500)
Signed-off-by: Paul Cuzner <pcuzner@redhat.com>
(cherry picked from commit 5a555ba93854bc8cf7cfd7ead23509fa5782374a)

src/pybind/mgr/cephadm/configchecks.py

index 069c1c9a31b409656f8d620841f02a7d255a5d3d..4c43bfb754ff851bcdbac9599518d9e4cecfcf65 100644 (file)
@@ -1,51 +1,68 @@
-# import ssl
 import json
-# import sys
-# import tempfile
-# import time
-# import socket
 import ipaddress
 import logging
-# import inspect
-# from threading import Event
-# from urllib.request import Request, urlopen
 
-from typing import Any, Dict, List, Set
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
-logger = logging.getLogger(__name__)
-
-
-class Hosts:
-    def __init__(self):
-        self.map = {}
-
-    def add_host(self, hostname: str, hostData: Dict[str, Any]) -> None:
-        return None
-
-    def hosts_with_role(self, role: str) -> List[str]:
-        return []
+if TYPE_CHECKING:
+    from cephadm.module import CephadmOrchestrator
 
-
-class HostData:
-    def __init__(self, json_data: Dict[str, Any]):
-        data = json_data.get('data', None)
-        assert data
-        self.valid = True if data and len(data.keys()) > 1 else False
-        if self.valid:
-            data = json_data['data']
-            for k in data:
-                setattr(self, k, data[k])
+logger = logging.getLogger(__name__)
 
 
-class CephHost:
-    def __init__(self, json_data: Dict[str, Any]):
-        self.os = HostData(json_data.get('host', {}))
-        daemon_section = json_data.get('daemons', {})
-        self.daemons = daemon_section['data'] if 'data' in daemon_section else []
+class HostFacts:
+
+    def __init__(self) -> None:
+        self.arch: Optional[str] = None
+        self.bios_date: Optional[str] = None
+        self.bios_version: Optional[str] = None
+        self.cpu_cores: Optional[int] = None
+        self.cpu_count: Optional[int] = None
+        self.cpu_load: Optional[Dict[str, float]] = None
+        self.cpu_model: Optional[str] = None
+        self.cpu_threads: Optional[int] = None
+        self.flash_capacity: Optional[str] = None
+        self.flash_capacity_bytes: Optional[int] = None
+        self.flash_count: Optional[int] = None
+        self.flash_list: Optional[List[Dict[str, Any]]] = None
+        self.hdd_capacity: Optional[str] = None
+        self.hdd_capacity_bytes: Optional[int] = None
+        self.hdd_count: Optional[int] = None
+        self.hdd_list: Optional[List[Dict[str, Any]]] = None
+        self.hostname: Optional[str] = None
+        self.interfaces: Dict[str, Dict[str, Any]] = {}
+        self.kernel: Optional[str] = None
+        self.kernel_parameters: Optional[Dict[str, Any]] = None
+        self.kernel_security: Dict[str, str] = {}
+        self.memory_available_kb: Optional[int] = None
+        self.memory_free_kb: Optional[int] = None
+        self.memory_total_kb: Optional[int] = None
+        self.model: Optional[str] = None
+        self.nic_count: Optional[int] = None
+        self.operating_system: Optional[str] = None
+        self.subscribed: Optional[str] = None
+        self.system_uptime: Optional[float] = None
+        self.timestamp: Optional[float] = None
+        self.vendor: Optional[str] = None
+        self._valid = False
+
+    def load_facts(self, json_data: Dict[str, Any]) -> None:
+
+        if isinstance(json_data, dict):
+            keys = json_data.keys()
+            if all([k in keys for k in self.__dict__ if not k.startswith('_')]):
+                self._valid = True
+                for k in json_data.keys():
+                    if hasattr(self, k):
+                        setattr(self, k, json_data[k])
+            else:
+                self._valid = False
+        else:
+            self._valid = False
 
 
 class SubnetLookup:
-    def __init__(self, subnet, hostname, mtu, speed):
+    def __init__(self, subnet: str, hostname: str, mtu: str, speed: str):
         self.subnet = subnet
         self.mtu_map = {
             mtu: [hostname]
@@ -54,11 +71,11 @@ class SubnetLookup:
             speed: [hostname]
         }
 
-    @property
+    @ property
     def host_list(self) -> List[str]:
         return self.mtu_map[next(iter(self.mtu_map))]
 
-    def update(self, hostname, mtu, speed) -> None:
+    def update(self, hostname: str, mtu: str, speed: str) -> None:
         if mtu in self.mtu_map and hostname not in self.mtu_map[mtu]:
             self.mtu_map[mtu].append(hostname)
         else:
@@ -69,7 +86,7 @@ class SubnetLookup:
         else:
             self.speed_map[speed] = [hostname]
 
-    def __repr__(self) -> Dict[str, Any]:
+    def __repr__(self) -> str:
         return json.dumps({
             "subnet": self.subnet,
             "mtu_map": self.mtu_map,
@@ -78,22 +95,34 @@ class SubnetLookup:
 
 
 class CephadmConfigChecks:
-    def __init__(self, mgr):
-        self.mgr = mgr
+    def __init__(self, mgr: "CephadmOrchestrator"):
+        self.mgr: "CephadmOrchestrator" = mgr
         self.log = logger
-        self.healthchecks = {
+        self.healthchecks: Dict[str, List[str]] = {
             "CEPHADM_CHECK_OS": [],
             "CEPHADM_CHECK_SUBSCRIPTION": [],
             "CEPHADM_CHECK_KERNEL_LSM": [],
             "CEPHADM_CHECK_MTU": [],
             "CEPHADM_CHECK_LINKSPEED": [],
             "CEPHADM_CHECK_PUBLIC_MEMBERSHIP": [],
+            "CEPHADM_CHECK_NETWORK_MISSING": [],
             "CEPHADM_CHECK_DAEMON_DISABLED": [],
+            "CEPHADM_CHECK_CEPH_VERSION": [],
+            "CEPHADM_CHECK_KERNEL_VERSION": [],
         }
-        self.subnet_lookup = {}  # subnet CIDR -> SubnetLookup Object
+        self.subnet_lookup: Dict[str, SubnetLookup] = {}  # subnet CIDR -> SubnetLookup Object
+        self.lsm_to_host: Dict[str, List[str]] = {}
+        self.subscribed: Dict[str, List[str]] = {
+            "yes": [],
+            "no": [],
+        }
+        self.host_to_role: Dict[str, List[Optional[str]]] = {}
+        self.kernel_to_hosts: Dict[str, List[str]] = {}
+
+        self.public_network_list: List[str] = []
+        self.cluster_network_list: List[str] = []
 
-        self.public_network_list = None
-        self.cluster_network_list = None
+    def load_network_config(self) -> None:
         ret, out, _err = self.mgr.mon_command({
             'prefix': 'config dump',
             'format': 'json'
@@ -133,118 +162,141 @@ class CephadmConfigChecks:
         if nic['ipv6_address']:
             pass
 
-    def role_from_name(self, name: str) -> str:
-        return name.split('.')[0]
-
-    def hosts_with_role(self, role_name: str) -> List[str]:
+    def hosts_with_role(self, role: str) -> List[str]:
         host_list = []
-        for hostname in self.mgr.cache.facts:
-            host = CephHost(self.mgr.cache.facts[hostname])
-            for d in host.daemons:
-                if self.role_from_name(d['name']) == role_name:
-                    host_list.append(hostname)
+        for hostname, roles in self.host_to_role.items():
+            if role in roles:
+                host_list.append(hostname)
         return host_list
 
-    def host_roles(self, hostname: str) -> List[str]:
-        role_list = []
-        host = CephHost(self.mgr.cache.facts[hostname])
-        for d in host.daemons:
-            role_list.append(self.role_from_name(d['name']))
-        return role_list
+    def reset(self) -> None:
+        self.subnet_lookup.clear()
+        self.lsm_to_host.clear()
+        self.subscribed['yes'] = []
+        self.subscribed['no'] = []
+        self.host_to_role.clear()
+        self.kernel_to_hosts.clear()
+
+    def _check_kernel_lsm(self) -> None:
+        if len(self.lsm_to_host.keys()) > 1:
+            # selinux security policy issue - CEPHADM_CHECK_KERNEL_LSM
+            pass
+
+    def _check_subscription(self) -> None:
+        if len(self.subscribed['yes']) > 0 and len(self.subscribed['no']) > 0:
+            # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
+            pass
+
+    def _check_public_network(self) -> None:
+        all_hosts = set(self.mgr.cache.facts.keys())
+        for c_net in self.public_network_list:
+            subnet_data = self.subnet_lookup.get(c_net, None)
+            if subnet_data:
+                hosts_in_subnet = set(subnet_data.host_list)
+                errors = all_hosts.difference(hosts_in_subnet)
+                if errors:
+                    # CEPHADM_CHECK_PUBLIC_MEMBERSHIP
+                    self.log.error(f"subnet check for {c_net} found {len(errors)} host(s) missing")
+
+    def _check_osd_network(self) -> None:
+        osd_hosts = set(self.hosts_with_role('osd'))
+        osd_network_list = self.cluster_network_list or self.public_network_list
+        for osd_net in osd_network_list:
+            subnet_data = self.subnet_lookup.get(osd_net, None)
+            if subnet_data:
+
+                for mtu, host_list in subnet_data.mtu_map.items():
+                    mtu_hosts = set(host_list)
+                    errors = osd_hosts.difference(mtu_hosts)
+                    if errors:
+                        self.log.error(f"MTU problem {mtu}: {errors}")
+                        # CEPHADM_CHECK_MTU
+                        pass
+                for speed, host_list in subnet_data.speed_map.items():
+                    speed_hosts = set(host_list)
+                    errors = osd_hosts.difference(speed_hosts)
+                    if errors:
+                        self.log.error(f"Linkspeed problem {speed} : {errors}")
+                        # CEPHADM_CHECK_LINKSPEED
+                        pass
+
+            else:
+                # set CEPHADM_CHECK_NETWORK_MISSING healthcheck for this subnet
+                self.log.warning(
+                    f"Network {osd_net} has been defined, but is not present on any host")
+
+    def _check_version_parity(self) -> None:
+        upgrade_status = self.mgr.upgrade.upgrade_status()
+        if upgrade_status.in_progress:
+            # skip version consistency checks during an upgrade cycle
+            return
+        # service_map = self.mgr.get('service_map')
+        # CEPHADM_CHECK_CEPH_VERSION
+        pass
+
+    def _check_kernel_version(self) -> None:
+        if len(self.kernel_to_hosts.keys()) > 1:
+            self.log.warning("mixed kernel versions")
+            # CEPHADM_CHECK_KERNEL_VERSION
 
     def run_checks(self) -> None:
 
         # os_to_host = {}  # TODO needs better support in gather-facts
-        lsm_to_host = {}
-        subscribed: Dict[str, List[str]] = {
-            "Yes": [],
-            "No": [],
-        }
-        host_to_role = {}
-        disabled_daemons = {}
-        osd_hosts = set()
+        self.reset()
 
-        # build lookup "maps"
+        # build lookup "maps" by walking the host facts
         for hostname in self.mgr.cache.facts:
             self.log.error(json.dumps(self.mgr.cache.facts[hostname]))
-        #     host = CephHost(self.mgr.cache.facts[hostname])
-        #     # self.log.error(f"object contains {list(host.os.__dict__.keys())}")
-        #     # self.log.error(f"sec {host.os.kernel_security['description']}")
-        #     lsm_to_host[host.os.kernel_security['description']] = hostname
-        #     # self.log.error(f"{host.os.subscribed}")
-        #     subscribed[host.os.subscribed].append(hostname)
-
-        #     interfaces = host.os.interfaces
-        #     for name in interfaces:
-        #         if name in ['lo']:
-        #             continue
-        #         self._update_subnet_lookups(hostname, name, interfaces[name])
-
-        #     roles: Set[str] = set()
-        #     # for d in host.daemons:
-        #     #     roles.add(self.role_from_name(d['name'])
-        #     #     if not d['enabled']:
-        #     #         unit_name=d['systemd_unit']
-        #     #         disabled_daemons[unit_name]=hostname
-
-        #     host_to_role[hostname] = list(roles)
-
-        # osd_hosts = self.hosts_with_role('osd')
-
-        # # checks
-        # if len(lsm_to_host.keys()) > 1:
-        #     # selinux security policy issue - CEPHADM_CHECK_KERNEL_LSM
-        #     pass
-        # if len(subscribed['Yes']) > 0 and len(subscribed['No']) > 0:
-        #     # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
-        #     pass
-
-        # # all hosts must have NIC on the public network
-        # all_hosts = set(self.mgr.cache.facts.keys())
-        # for c_net in self.public_network_list:
-        #     subnet_data = self.subnet_lookup.get(c_net, None)
-        #     if subnet_data:
-        #         hosts_in_subnet = set(subnet_data.host_list)
-        #         errors = all_hosts.difference(hosts_in_subnet)
-        #         if errors:
-        #             # CEPHADM_CHECK_PUBLIC_MEMBERSHIP
-        #             self.log.error(f"subnet check for {c_net} found {len(errors)} host(s) missing")
-
-        # # all hosts with the osd role must share a common mtu and speed
-        # osd_network_list = self.cluster_network_list or self.public_network_list
-        # for osd_net in osd_network_list:
-        #     subnet_data = self.subnet_lookup.get(osd_net, None)
-        #     if subnet_data:
-
-        #         # FIXME this feels overly complex
-        #         # what do I want to show as the error?
-        #         for mtu, host_list in subnet_data.mtu_map.items():
-        #             mtu_hosts = set(host_list)
-        #             errors = osd_hosts.difference(mtu_hosts)
-        #             if errors:
-        #                 self.log.error(f"MTU problem {mtu}: {errors}")
-        #                 # CEPHADM_CHECK_MTU
-        #                 pass
-        #         for speed, host_list in subnet_data.speed_map.items():
-        #             speed_hosts = set(host_list)
-        #             errors = osd_hosts.difference(speed_hosts)
-        #             if errors:
-        #                 self.log.error(f"Linkspeed problem {speed} : {errors}")
-        #                 # CEPHADM_CHECK_LINKSPEED
-        #                 pass
-
-        #     else:
-        #         self.log.warning(
-        #             f"Network {osd_net} has been defined, but is not present on any host")
-
-        # # All daemons should be in an enabled state
-        # if disabled_daemons:
-        #     self.log.error("disabled daemons found")
-        #     # CEPHADM_CHECK_DAEMON_DISABLED
-        #     pass
-
-        # self.log.error(f"lsm status {json.dumps(lsm_to_host)}")
-        # self.log.error(f"subscription states : {json.dumps(subscribed)}")
-
-        # self.log.error(f"mtu data : {str(self.subnet_lookup)}")
-        # self.log.error(f"roles : {json.dumps(host_to_role)}")
+            host = HostFacts()
+            host.load_facts(self.mgr.cache.facts[hostname])
+            if not host._valid:
+                self.log.warning(f"skipping {hostname} - incompatible host facts")
+                continue
+
+            lsm_desc = host.kernel_security.get('description', '')
+            if lsm_desc:
+                if lsm_desc in self.lsm_to_host:
+                    self.lsm_to_host[lsm_desc].append(hostname)
+                else:
+                    self.lsm_to_host[lsm_desc] = [hostname]
+
+            subscription_state = host.subscribed.lower() if host.subscribed else None
+            if subscription_state:
+                self.subscribed[subscription_state].append(hostname)
+
+            interfaces: Dict[str, Dict[str, Any]] = host.interfaces
+            for name in interfaces.keys():
+                if name in ['lo']:
+                    continue
+                self._update_subnet_lookups(hostname, name, interfaces[name])
+
+            if host.kernel:
+                kernel_maj_min = '.'.join(host.kernel.split('.')[0:2])
+                if kernel_maj_min in self.kernel_to_hosts:
+                    self.kernel_to_hosts[kernel_maj_min].append(hostname)
+                else:
+                    self.kernel_to_hosts[kernel_maj_min] = [hostname]
+            else:
+                self.log.warning(f"Host gather facts for {hostname} is missing kernel information")
+
+            host_daemons = self.mgr.cache.daemons[hostname]
+
+            # NOTE: should daemondescription provide the systemd enabled state?
+            self.host_to_role[hostname] = list({host_daemons[name].daemon_type
+                                                for name in host_daemons})
+
+        # checks
+        # TODO only call the check if the check is enabled
+        self._check_kernel_lsm()
+        self._check_subscription()
+        self._check_public_network()
+        self._check_osd_network()
+        self._check_version_parity()
+        self._check_kernel_version()
+
+        self.log.error(f"lsm status {json.dumps(self.lsm_to_host)}")
+        self.log.error(f"subscription states : {json.dumps(self.subscribed)}")
+
+        self.log.error(f"mtu data : {str(self.subnet_lookup)}")
+        self.log.error(f"roles : {json.dumps(self.host_to_role)}")
+        self.log.error(f"kernel versions : {json.dumps(self.kernel_to_hosts)}")