From: Paul Cuzner Date: Wed, 10 Feb 2021 22:30:05 +0000 (+1300) Subject: mgr/cephadm: adding check logic X-Git-Tag: v16.2.0~106^2~92 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ef2ad5a30a4fd0274a5c40a5326eb6c994493c1e;p=ceph.git mgr/cephadm: adding check logic Signed-off-by: Paul Cuzner (cherry picked from commit 5a555ba93854bc8cf7cfd7ead23509fa5782374a) --- diff --git a/src/pybind/mgr/cephadm/configchecks.py b/src/pybind/mgr/cephadm/configchecks.py index 069c1c9a31b4..4c43bfb754ff 100644 --- a/src/pybind/mgr/cephadm/configchecks.py +++ b/src/pybind/mgr/cephadm/configchecks.py @@ -1,51 +1,68 @@ -# import ssl import json -# import sys -# import tempfile -# import time -# import socket import ipaddress import logging -# import inspect -# from threading import Event -# from urllib.request import Request, urlopen -from typing import Any, Dict, List, Set +from typing import TYPE_CHECKING, Any, Dict, List, Optional -logger = logging.getLogger(__name__) - - -class Hosts: - def __init__(self): - self.map = {} - - def add_host(self, hostname: str, hostData: Dict[str, Any]) -> None: - return None - - def hosts_with_role(self, role: str) -> List[str]: - return [] +if TYPE_CHECKING: + from cephadm.module import CephadmOrchestrator - -class HostData: - def __init__(self, json_data: Dict[str, Any]): - data = json_data.get('data', None) - assert data - self.valid = True if data and len(data.keys()) > 1 else False - if self.valid: - data = json_data['data'] - for k in data: - setattr(self, k, data[k]) +logger = logging.getLogger(__name__) -class CephHost: - def __init__(self, json_data: Dict[str, Any]): - self.os = HostData(json_data.get('host', {})) - daemon_section = json_data.get('daemons', {}) - self.daemons = daemon_section['data'] if 'data' in daemon_section else [] +class HostFacts: + + def __init__(self) -> None: + self.arch: Optional[str] = None + self.bios_date: Optional[str] = None + self.bios_version: Optional[str] = None + self.cpu_cores: Optional[int] = None + self.cpu_count: Optional[int] = None + self.cpu_load: Optional[Dict[str, float]] = None + self.cpu_model: Optional[str] = None + self.cpu_threads: Optional[int] = None + self.flash_capacity: Optional[str] = None + self.flash_capacity_bytes: Optional[int] = None + self.flash_count: Optional[int] = None + self.flash_list: Optional[List[Dict[str, Any]]] = None + self.hdd_capacity: Optional[str] = None + self.hdd_capacity_bytes: Optional[int] = None + self.hdd_count: Optional[int] = None + self.hdd_list: Optional[List[Dict[str, Any]]] = None + self.hostname: Optional[str] = None + self.interfaces: Dict[str, Dict[str, Any]] = {} + self.kernel: Optional[str] = None + self.kernel_parameters: Optional[Dict[str, Any]] = None + self.kernel_security: Dict[str, str] = {} + self.memory_available_kb: Optional[int] = None + self.memory_free_kb: Optional[int] = None + self.memory_total_kb: Optional[int] = None + self.model: Optional[str] = None + self.nic_count: Optional[int] = None + self.operating_system: Optional[str] = None + self.subscribed: Optional[str] = None + self.system_uptime: Optional[float] = None + self.timestamp: Optional[float] = None + self.vendor: Optional[str] = None + self._valid = False + + def load_facts(self, json_data: Dict[str, Any]) -> None: + + if isinstance(json_data, dict): + keys = json_data.keys() + if all([k in keys for k in self.__dict__ if not k.startswith('_')]): + self._valid = True + for k in json_data.keys(): + if hasattr(self, k): + setattr(self, k, json_data[k]) + else: + self._valid = False + else: + self._valid = False class SubnetLookup: - def __init__(self, subnet, hostname, mtu, speed): + def __init__(self, subnet: str, hostname: str, mtu: str, speed: str): self.subnet = subnet self.mtu_map = { mtu: [hostname] @@ -54,11 +71,11 @@ class SubnetLookup: speed: [hostname] } - @property + @ property def host_list(self) -> List[str]: return self.mtu_map[next(iter(self.mtu_map))] - def update(self, hostname, mtu, speed) -> None: + def update(self, hostname: str, mtu: str, speed: str) -> None: if mtu in self.mtu_map and hostname not in self.mtu_map[mtu]: self.mtu_map[mtu].append(hostname) else: @@ -69,7 +86,7 @@ class SubnetLookup: else: self.speed_map[speed] = [hostname] - def __repr__(self) -> Dict[str, Any]: + def __repr__(self) -> str: return json.dumps({ "subnet": self.subnet, "mtu_map": self.mtu_map, @@ -78,22 +95,34 @@ class SubnetLookup: class CephadmConfigChecks: - def __init__(self, mgr): - self.mgr = mgr + def __init__(self, mgr: "CephadmOrchestrator"): + self.mgr: "CephadmOrchestrator" = mgr self.log = logger - self.healthchecks = { + self.healthchecks: Dict[str, List[str]] = { "CEPHADM_CHECK_OS": [], "CEPHADM_CHECK_SUBSCRIPTION": [], "CEPHADM_CHECK_KERNEL_LSM": [], "CEPHADM_CHECK_MTU": [], "CEPHADM_CHECK_LINKSPEED": [], "CEPHADM_CHECK_PUBLIC_MEMBERSHIP": [], + "CEPHADM_CHECK_NETWORK_MISSING": [], "CEPHADM_CHECK_DAEMON_DISABLED": [], + "CEPHADM_CHECK_CEPH_VERSION": [], + "CEPHADM_CHECK_KERNEL_VERSION": [], } - self.subnet_lookup = {} # subnet CIDR -> SubnetLookup Object + self.subnet_lookup: Dict[str, SubnetLookup] = {} # subnet CIDR -> SubnetLookup Object + self.lsm_to_host: Dict[str, List[str]] = {} + self.subscribed: Dict[str, List[str]] = { + "yes": [], + "no": [], + } + self.host_to_role: Dict[str, List[Optional[str]]] = {} + self.kernel_to_hosts: Dict[str, List[str]] = {} + + self.public_network_list: List[str] = [] + self.cluster_network_list: List[str] = [] - self.public_network_list = None - self.cluster_network_list = None + def load_network_config(self) -> None: ret, out, _err = self.mgr.mon_command({ 'prefix': 'config dump', 'format': 'json' @@ -133,118 +162,141 @@ class CephadmConfigChecks: if nic['ipv6_address']: pass - def role_from_name(self, name: str) -> str: - return name.split('.')[0] - - def hosts_with_role(self, role_name: str) -> List[str]: + def hosts_with_role(self, role: str) -> List[str]: host_list = [] - for hostname in self.mgr.cache.facts: - host = CephHost(self.mgr.cache.facts[hostname]) - for d in host.daemons: - if self.role_from_name(d['name']) == role_name: - host_list.append(hostname) + for hostname, roles in self.host_to_role.items(): + if role in roles: + host_list.append(hostname) return host_list - def host_roles(self, hostname: str) -> List[str]: - role_list = [] - host = CephHost(self.mgr.cache.facts[hostname]) - for d in host.daemons: - role_list.append(self.role_from_name(d['name'])) - return role_list + def reset(self) -> None: + self.subnet_lookup.clear() + self.lsm_to_host.clear() + self.subscribed['yes'] = [] + self.subscribed['no'] = [] + self.host_to_role.clear() + self.kernel_to_hosts.clear() + + def _check_kernel_lsm(self) -> None: + if len(self.lsm_to_host.keys()) > 1: + # selinux security policy issue - CEPHADM_CHECK_KERNEL_LSM + pass + + def _check_subscription(self) -> None: + if len(self.subscribed['yes']) > 0 and len(self.subscribed['no']) > 0: + # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION + pass + + def _check_public_network(self) -> None: + all_hosts = set(self.mgr.cache.facts.keys()) + for c_net in self.public_network_list: + subnet_data = self.subnet_lookup.get(c_net, None) + if subnet_data: + hosts_in_subnet = set(subnet_data.host_list) + errors = all_hosts.difference(hosts_in_subnet) + if errors: + # CEPHADM_CHECK_PUBLIC_MEMBERSHIP + self.log.error(f"subnet check for {c_net} found {len(errors)} host(s) missing") + + def _check_osd_network(self) -> None: + osd_hosts = set(self.hosts_with_role('osd')) + osd_network_list = self.cluster_network_list or self.public_network_list + for osd_net in osd_network_list: + subnet_data = self.subnet_lookup.get(osd_net, None) + if subnet_data: + + for mtu, host_list in subnet_data.mtu_map.items(): + mtu_hosts = set(host_list) + errors = osd_hosts.difference(mtu_hosts) + if errors: + self.log.error(f"MTU problem {mtu}: {errors}") + # CEPHADM_CHECK_MTU + pass + for speed, host_list in subnet_data.speed_map.items(): + speed_hosts = set(host_list) + errors = osd_hosts.difference(speed_hosts) + if errors: + self.log.error(f"Linkspeed problem {speed} : {errors}") + # CEPHADM_CHECK_LINKSPEED + pass + + else: + # set CEPHADM_CHECK_NETWORK_MISSING healthcheck for this subnet + self.log.warning( + f"Network {osd_net} has been defined, but is not present on any host") + + def _check_version_parity(self) -> None: + upgrade_status = self.mgr.upgrade.upgrade_status() + if upgrade_status.in_progress: + # skip version consistency checks during an upgrade cycle + return + # service_map = self.mgr.get('service_map') + # CEPHADM_CHECK_CEPH_VERSION + pass + + def _check_kernel_version(self) -> None: + if len(self.kernel_to_hosts.keys()) > 1: + self.log.warning("mixed kernel versions") + # CEPHADM_CHECK_KERNEL_VERSION def run_checks(self) -> None: # os_to_host = {} # TODO needs better support in gather-facts - lsm_to_host = {} - subscribed: Dict[str, List[str]] = { - "Yes": [], - "No": [], - } - host_to_role = {} - disabled_daemons = {} - osd_hosts = set() + self.reset() - # build lookup "maps" + # build lookup "maps" by walking the host facts for hostname in self.mgr.cache.facts: self.log.error(json.dumps(self.mgr.cache.facts[hostname])) - # host = CephHost(self.mgr.cache.facts[hostname]) - # # self.log.error(f"object contains {list(host.os.__dict__.keys())}") - # # self.log.error(f"sec {host.os.kernel_security['description']}") - # lsm_to_host[host.os.kernel_security['description']] = hostname - # # self.log.error(f"{host.os.subscribed}") - # subscribed[host.os.subscribed].append(hostname) - - # interfaces = host.os.interfaces - # for name in interfaces: - # if name in ['lo']: - # continue - # self._update_subnet_lookups(hostname, name, interfaces[name]) - - # roles: Set[str] = set() - # # for d in host.daemons: - # # roles.add(self.role_from_name(d['name']) - # # if not d['enabled']: - # # unit_name=d['systemd_unit'] - # # disabled_daemons[unit_name]=hostname - - # host_to_role[hostname] = list(roles) - - # osd_hosts = self.hosts_with_role('osd') - - # # checks - # if len(lsm_to_host.keys()) > 1: - # # selinux security policy issue - CEPHADM_CHECK_KERNEL_LSM - # pass - # if len(subscribed['Yes']) > 0 and len(subscribed['No']) > 0: - # # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION - # pass - - # # all hosts must have NIC on the public network - # all_hosts = set(self.mgr.cache.facts.keys()) - # for c_net in self.public_network_list: - # subnet_data = self.subnet_lookup.get(c_net, None) - # if subnet_data: - # hosts_in_subnet = set(subnet_data.host_list) - # errors = all_hosts.difference(hosts_in_subnet) - # if errors: - # # CEPHADM_CHECK_PUBLIC_MEMBERSHIP - # self.log.error(f"subnet check for {c_net} found {len(errors)} host(s) missing") - - # # all hosts with the osd role must share a common mtu and speed - # osd_network_list = self.cluster_network_list or self.public_network_list - # for osd_net in osd_network_list: - # subnet_data = self.subnet_lookup.get(osd_net, None) - # if subnet_data: - - # # FIXME this feels overly complex - # # what do I want to show as the error? - # for mtu, host_list in subnet_data.mtu_map.items(): - # mtu_hosts = set(host_list) - # errors = osd_hosts.difference(mtu_hosts) - # if errors: - # self.log.error(f"MTU problem {mtu}: {errors}") - # # CEPHADM_CHECK_MTU - # pass - # for speed, host_list in subnet_data.speed_map.items(): - # speed_hosts = set(host_list) - # errors = osd_hosts.difference(speed_hosts) - # if errors: - # self.log.error(f"Linkspeed problem {speed} : {errors}") - # # CEPHADM_CHECK_LINKSPEED - # pass - - # else: - # self.log.warning( - # f"Network {osd_net} has been defined, but is not present on any host") - - # # All daemons should be in an enabled state - # if disabled_daemons: - # self.log.error("disabled daemons found") - # # CEPHADM_CHECK_DAEMON_DISABLED - # pass - - # self.log.error(f"lsm status {json.dumps(lsm_to_host)}") - # self.log.error(f"subscription states : {json.dumps(subscribed)}") - - # self.log.error(f"mtu data : {str(self.subnet_lookup)}") - # self.log.error(f"roles : {json.dumps(host_to_role)}") + host = HostFacts() + host.load_facts(self.mgr.cache.facts[hostname]) + if not host._valid: + self.log.warning(f"skipping {hostname} - incompatible host facts") + continue + + lsm_desc = host.kernel_security.get('description', '') + if lsm_desc: + if lsm_desc in self.lsm_to_host: + self.lsm_to_host[lsm_desc].append(hostname) + else: + self.lsm_to_host[lsm_desc] = [hostname] + + subscription_state = host.subscribed.lower() if host.subscribed else None + if subscription_state: + self.subscribed[subscription_state].append(hostname) + + interfaces: Dict[str, Dict[str, Any]] = host.interfaces + for name in interfaces.keys(): + if name in ['lo']: + continue + self._update_subnet_lookups(hostname, name, interfaces[name]) + + if host.kernel: + kernel_maj_min = '.'.join(host.kernel.split('.')[0:2]) + if kernel_maj_min in self.kernel_to_hosts: + self.kernel_to_hosts[kernel_maj_min].append(hostname) + else: + self.kernel_to_hosts[kernel_maj_min] = [hostname] + else: + self.log.warning(f"Host gather facts for {hostname} is missing kernel information") + + host_daemons = self.mgr.cache.daemons[hostname] + + # NOTE: should daemondescription provide the systemd enabled state? + self.host_to_role[hostname] = list({host_daemons[name].daemon_type + for name in host_daemons}) + + # checks + # TODO only call the check if the check is enabled + self._check_kernel_lsm() + self._check_subscription() + self._check_public_network() + self._check_osd_network() + self._check_version_parity() + self._check_kernel_version() + + self.log.error(f"lsm status {json.dumps(self.lsm_to_host)}") + self.log.error(f"subscription states : {json.dumps(self.subscribed)}") + + self.log.error(f"mtu data : {str(self.subnet_lookup)}") + self.log.error(f"roles : {json.dumps(self.host_to_role)}") + self.log.error(f"kernel versions : {json.dumps(self.kernel_to_hosts)}")