From: Guillaume Abrioux Date: Tue, 9 Apr 2024 15:07:45 +0000 (+0000) Subject: node-proxy: make the daemon discover endpoints X-Git-Tag: testing/wip-vshankar-testing-20240515.171252-debug~12^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=ce360a4a5f67329b95a4b75bc7b5e59dc7633637;p=ceph-ci.git node-proxy: make the daemon discover endpoints Add logic in order to explore the API. This will allow node-proxy to be compatible with more hardware. Fixes: https://tracker.ceph.com/issues/65394 Signed-off-by: Guillaume Abrioux --- diff --git a/src/ceph-node-proxy/ceph_node_proxy/baseredfishsystem.py b/src/ceph-node-proxy/ceph_node_proxy/baseredfishsystem.py index ea4e65cc6ac..674a7427e85 100644 --- a/src/ceph-node-proxy/ceph_node_proxy/baseredfishsystem.py +++ b/src/ceph-node-proxy/ceph_node_proxy/baseredfishsystem.py @@ -3,17 +3,126 @@ import json from ceph_node_proxy.basesystem import BaseSystem from ceph_node_proxy.redfish_client import RedFishClient from time import sleep -from ceph_node_proxy.util import get_logger +from ceph_node_proxy.util import get_logger, to_snake_case from typing import Dict, Any, List, Callable, Union from urllib.error import HTTPError, URLError +class EndpointMgr: + NAME: str = 'EndpointMgr' + + def __init__(self, + client: RedFishClient, + prefix: str = RedFishClient.PREFIX) -> None: + self.log = get_logger(f'{__name__}:{EndpointMgr.NAME}') + self.prefix: str = prefix + self.client: RedFishClient = client + + def __getitem__(self, index: str) -> Any: + if index in self.__dict__: + return self.__dict__[index] + else: + raise RuntimeError(f'{index} is not a valid endpoint.') + + def init(self) -> None: + _error_msg: str = "Can't discover entrypoint(s)" + try: + _, _data, _ = self.client.query(endpoint=self.prefix) + json_data: Dict[str, Any] = json.loads(_data) + for k, v in json_data.items(): + if '@odata.id' in v: + self.log.debug(f'entrypoint found: {to_snake_case(k)} = {v["@odata.id"]}') + _name: str = to_snake_case(k) + _url: str = v['@odata.id'] + e = Endpoint(self, _url, self.client) + setattr(self, _name, e) + setattr(self, 'session', json_data['Links']['Sessions']['@odata.id']) # TODO(guits): needs to be fixed + except (URLError, KeyError) as e: + msg = f'{_error_msg}: {e}' + self.log.error(msg) + raise RuntimeError + + +class Endpoint: + NAME: str = 'Endpoint' + + def __init__(self, url: str, client: RedFishClient) -> None: + self.log = get_logger(f'{__name__}:{Endpoint.NAME}') + self.url: str = url + self.client: RedFishClient = client + self.data: Dict[str, Any] = self.get_data() + self.id: str = '' + self.members_names: List[str] = [] + + if self.has_members: + self.members_names = self.get_members_names() + + if self.data: + try: + self.id = self.data['Id'] + except KeyError: + self.id = self.data['@odata.id'].split('/')[-1:] + else: + self.log.warning(f'No data could be loaded for {self.url}') + + def __getitem__(self, index: str) -> Any: + if not getattr(self, index, False): + _url: str = f'{self.url}/{index}' + setattr(self, index, Endpoint(_url, self.client)) + return self.__dict__[index] + + def query(self, url: str) -> Dict[str, Any]: + data: Dict[str, Any] = {} + try: + self.log.debug(f'Querying {url}') + _, _data, _ = self.client.query(endpoint=url) + data = json.loads(_data) + except KeyError as e: + self.log.error(f'Error while querying {self.url}: {e}') + return data + + def get_data(self) -> Dict[str, Any]: + return self.query(self.url) + + def get_members_names(self) -> List[str]: + result: List[str] = [] + if self.has_members: + for member in self.data['Members']: + name: str = member['@odata.id'].split('/')[-1:][0] + result.append(name) + return result + + def get_name(self, endpoint: str) -> str: + return endpoint.split('/')[-1:][0] + + def get_members_endpoints(self) -> Dict[str, str]: + members: Dict[str, str] = {} + name: str = '' + if self.has_members: + for member in self.data['Members']: + name = self.get_name(member['@odata.id']) + members[name] = member['@odata.id'] + else: + name = self.get_name(self.data['@odata.id']) + members[name] = self.data['@odata.id'] + + return members + + def get_members_data(self) -> Dict[str, Any]: + result: Dict[str, Any] = {} + if self.has_members: + for member, endpoint in self.get_members_endpoints().items(): + result[member] = self.query(endpoint) + return result + + @property + def has_members(self) -> bool: + return 'Members' in self.data.keys() + + class BaseRedfishSystem(BaseSystem): def __init__(self, **kw: Any) -> None: super().__init__(**kw) - self.common_endpoints: List[str] = kw.get('common_endpoints', ['/Systems/System.Embedded.1', - '/UpdateService']) - self.chassis_endpoint: str = kw.get('chassis_endpoint', '/Chassis/System.Embedded.1') self.log = get_logger(__name__) self.host: str = kw['host'] self.port: str = kw['port'] @@ -21,6 +130,7 @@ class BaseRedfishSystem(BaseSystem): self.password: str = kw['password'] # move the following line (class attribute?) self.client: RedFishClient = RedFishClient(host=self.host, port=self.port, username=self.username, password=self.password) + self.endpoints: EndpointMgr = EndpointMgr(self.client) self.log.info(f'redfish system initialization, host: {self.host}, user: {self.username}') self.data_ready: bool = False self.previous_data: Dict = {} @@ -48,6 +158,8 @@ class BaseRedfishSystem(BaseSystem): def main(self) -> None: self.stop = False self.client.login() + self.endpoints.init() + while not self.stop: self.log.debug('waiting for a lock in the update loop.') with self.lock: @@ -100,9 +212,7 @@ class BaseRedfishSystem(BaseSystem): return result def get_members(self, data: Dict[str, Any], path: str) -> List: - _path = data[path]['@odata.id'] - _data = self._get_path(_path) - return [self._get_path(member['@odata.id']) for member in _data['Members']] + return [self._get_path(member['@odata.id']) for member in data['Members']] def get_system(self) -> Dict[str, Any]: result = { @@ -117,15 +227,18 @@ class BaseRedfishSystem(BaseSystem): 'fans': self.get_fans() }, 'firmwares': self.get_firmwares(), - 'chassis': {'redfish_endpoint': f'/redfish/v1{self.chassis_endpoint}'} # TODO(guits): not ideal } return result def _update_system(self) -> None: - for endpoint in self.common_endpoints: - result = self.client.get_path(endpoint) - _endpoint = endpoint.strip('/').split('/')[0] - self._system[_endpoint] = result + system_members: Dict[str, Any] = self.endpoints['systems'].get_members_data() + update_service_members: Endpoint = self.endpoints['update_service'] + + for member, data in system_members.items(): + self._system[member] = data + self._sys[member] = dict() + + self._system[update_service_members.id] = update_service_members.data def _update_sn(self) -> None: raise NotImplementedError() @@ -196,7 +309,7 @@ class BaseRedfishSystem(BaseSystem): def set_device_led(self, device: str, data: Dict[str, bool]) -> int: try: - _, response, status = self.client.query( + _, _, status = self.client.query( data=json.dumps(data), method='PATCH', endpoint=self._sys['storage'][device]['redfish_endpoint'] @@ -207,7 +320,7 @@ class BaseRedfishSystem(BaseSystem): return status def get_chassis_led(self) -> Dict[str, Any]: - endpoint = f'/redfish/v1/{self.chassis_endpoint}' + endpoint = list(self.endpoints['chassis'].get_members_endpoints().values())[0] try: result = self.client.query(method='GET', endpoint=endpoint, @@ -227,10 +340,10 @@ class BaseRedfishSystem(BaseSystem): # '{"IndicatorLED": "Lit"}' -> LocationIndicatorActive = false # '{"IndicatorLED": "Blinking"}' -> LocationIndicatorActive = true try: - _, response, status = self.client.query( + _, _, status = self.client.query( data=json.dumps(data), method='PATCH', - endpoint=f'/redfish/v1{self.chassis_endpoint}' + endpoint=list(self.endpoints['chassis'].get_members_endpoints().values())[0] ) except HTTPError as e: self.log.error(f"Couldn't set the ident chassis LED: {e}") @@ -260,7 +373,7 @@ class BaseRedfishSystem(BaseSystem): def create_reboot_job(self, reboot_type: str) -> str: data: Dict[str, str] = dict(RebootJobType=reboot_type) try: - headers, response, status = self.client.query( + headers, _, _ = self.client.query( data=json.dumps(data), endpoint=self.create_reboot_job_endpoint ) @@ -273,7 +386,7 @@ class BaseRedfishSystem(BaseSystem): def schedule_reboot_job(self, job_id: str) -> int: data: Dict[str, Union[List[str], str]] = dict(JobArray=[job_id], StartTimeInterval='TIME_NOW') try: - headers, response, status = self.client.query( + _, _, status = self.client.query( data=json.dumps(data), endpoint=self.setup_job_queue_endpoint ) diff --git a/src/ceph-node-proxy/ceph_node_proxy/redfish_client.py b/src/ceph-node-proxy/ceph_node_proxy/redfish_client.py index 64a4e44dfe3..d75d9a3cc8c 100644 --- a/src/ceph-node-proxy/ceph_node_proxy/redfish_client.py +++ b/src/ceph-node-proxy/ceph_node_proxy/redfish_client.py @@ -22,9 +22,24 @@ class RedFishClient(BaseClient): self.url: str = f'https://{self.host}:{self.port}' self.token: str = '' self.location: str = '' + self.session_service: str = '' + + def sessionservice_discover(self) -> None: + _error_msg: str = "Can't discover SessionService url" + try: + _headers, _data, _status_code = self.query(endpoint=RedFishClient.PREFIX) + json_data: Dict[str, Any] = json.loads(_data) + self.session_service = json_data['Links']['Sessions']['@odata.id'] + except (URLError, KeyError) as e: + msg = f'{_error_msg}: {e}' + self.log.error(msg) + raise RuntimeError def login(self) -> None: if not self.is_logged_in(): + self.log.debug('Discovering SessionService url...') + self.sessionservice_discover() + self.log.debug(f'SessionService url is {self.session_service}') self.log.info('Logging in to ' f"{self.url} as '{self.username}'") oob_credentials = json.dumps({'UserName': self.username, @@ -35,7 +50,7 @@ class RedFishClient(BaseClient): try: _headers, _data, _status_code = self.query(data=oob_credentials, headers=headers, - endpoint='/redfish/v1/SessionService/Sessions/') + endpoint=self.session_service) if _status_code != 201: self.log.error(f"Can't log in to {self.url} as '{self.username}': {_status_code}") raise RuntimeError @@ -119,5 +134,5 @@ class RedFishClient(BaseClient): return response_headers, response_str, response_status except (HTTPError, URLError) as e: - self.log.debug(f'{e}') + self.log.debug(f'endpoint={endpoint} err={e}') raise diff --git a/src/ceph-node-proxy/ceph_node_proxy/redfishdellsystem.py b/src/ceph-node-proxy/ceph_node_proxy/redfishdellsystem.py index ffd88652fbe..8a478fe32f6 100644 --- a/src/ceph-node-proxy/ceph_node_proxy/redfishdellsystem.py +++ b/src/ceph-node-proxy/ceph_node_proxy/redfishdellsystem.py @@ -1,6 +1,7 @@ -from ceph_node_proxy.baseredfishsystem import BaseRedfishSystem +from ceph_node_proxy.baseredfishsystem import BaseRedfishSystem, Endpoint from ceph_node_proxy.util import get_logger, normalize_dict, to_snake_case -from typing import Dict, Any, List +from typing import Dict, Any, List, Optional +from urllib.error import HTTPError class RedfishDellSystem(BaseRedfishSystem): @@ -11,37 +12,43 @@ class RedfishDellSystem(BaseRedfishSystem): self.create_reboot_job_endpoint: str = f'{self.job_service_endpoint}/Actions/DellJobService.CreateRebootJob' self.setup_job_queue_endpoint: str = f'{self.job_service_endpoint}/Actions/DellJobService.SetupJobQueue' - def build_common_data(self, - data: Dict[str, Any], - fields: List, - path: str) -> Dict[str, Dict[str, Dict]]: - result: Dict[str, Dict[str, Dict]] = dict() - for member_info in self.get_members(data, path): - member_id = member_info['Id'] - result[member_id] = dict() + def build_data(self, + data: Dict[str, Any], + fields: List[str], + attribute: Optional[str] = None) -> Dict[str, Dict[str, Dict]]: + result: Dict[str, Dict[str, Optional[Dict]]] = dict() + member_id: str = '' + + def process_data(m_id: str, fields: List[str], data: Dict[str, Any]) -> Dict[str, Any]: + result: Dict[str, Any] = {} for field in fields: try: - result[member_id][to_snake_case(field)] = member_info[field] + result[to_snake_case(field)] = data[field] except KeyError: - self.log.warning(f'Could not find field: {field} in member_info: {member_info}') - - return normalize_dict(result) - - def build_chassis_data(self, - fields: Dict[str, List[str]], - path: str) -> Dict[str, Dict[str, Dict]]: - result: Dict[str, Dict[str, Dict]] = dict() - data = self._get_path(f'{self.chassis_endpoint}/{path}') - - for elt, _fields in fields.items(): - for member_elt in data[elt]: - _id = member_elt['MemberId'] - result[_id] = dict() - for field in _fields: - try: - result[_id][to_snake_case(field)] = member_elt[field] - except KeyError: - self.log.warning(f'Could not find field: {field} in data: {data[elt]}') + self.log.warning(f'Could not find field: {field} in data: {data}') + result[to_snake_case(field)] = None + return result + + try: + if attribute is not None: + data_items = data[attribute] + else: + # The following is a hack to re-inject the key to the dict + # as we have the following structure when `attribute` is passed: + # "PowerSupplies": [ {"MemberId": "0", ...}, {"MemberId": "1", ...} ] + # vs. this structure in the opposite case: + # { "CPU.Socket.2": { "Id": "CPU.Socket.2", "Manufacturer": "Intel" }, "CPU.Socket.1": {} } + # With the first case, we clearly use the field "MemberId". + # With the second case, we use the key of the dict. + # This is mostly for avoiding code duplication. + data_items = [{'MemberId': k, **v} for k, v in data.items()] + for d in data_items: + member_id = d.get('MemberId') + result[member_id] = {} + result[member_id] = process_data(member_id, fields, d) + + except Exception as e: + self.log.error(f"Can't build data: {e}") return normalize_dict(result) def get_sn(self) -> str: @@ -74,9 +81,36 @@ class RedfishDellSystem(BaseRedfishSystem): def _update_network(self) -> None: fields = ['Description', 'Name', 'SpeedMbps', 'Status'] self.log.debug('Updating network') - self._sys['network'] = self.build_common_data(data=self._system['Systems'], - fields=fields, - path='EthernetInterfaces') + self.update('systems', 'network', 'EthernetInterfaces', fields) + + def update(self, + collection: str, + component: str, + path: str, + fields: List[str], + attribute: Optional[str] = None) -> None: + members: List[str] = self.endpoints[collection].get_members_names() + result: Dict[str, Any] = {} + data: Dict[str, Any] = {} + data_built: Dict[str, Any] = {} + if not members: + data = self.endpoints[collection][path].get_members_data() + data_built = self.build_data(data=data, fields=fields, attribute=attribute) + result = data_built + else: + for member in members: + data_built = {} + try: + if attribute is None: + data = self.endpoints[collection][member][path].get_members_data() + else: + data = self.endpoints[collection][member][path].data + except HTTPError as e: + self.log.debug(f'Error while updating {component}: {e}') + else: + data_built = self.build_data(data=data, fields=fields, attribute=attribute) + result[member] = data_built + self._sys[component] = result def _update_processors(self) -> None: fields = ['Description', @@ -87,9 +121,7 @@ class RedfishDellSystem(BaseRedfishSystem): 'Status', 'Manufacturer'] self.log.debug('Updating processors') - self._sys['processors'] = self.build_common_data(data=self._system['Systems'], - fields=fields, - path='Processors') + self.update('systems', 'processors', 'Processors', fields) def _update_storage(self) -> None: fields = ['Description', @@ -98,25 +130,29 @@ class RedfishDellSystem(BaseRedfishSystem): 'LocationIndicatorActive', 'SerialNumber', 'Status', 'PhysicalLocation'] - entities = self.get_members(data=self._system['Systems'], - path='Storage') - self.log.debug('Updating storage') result: Dict[str, Dict[str, Dict]] = dict() - for entity in entities: - for drive in entity['Drives']: - drive_path = drive['@odata.id'] - drive_info = self._get_path(drive_path) - drive_id = drive_info['Id'] - result[drive_id] = dict() - result[drive_id]['redfish_endpoint'] = drive['@odata.id'] - for field in fields: - result[drive_id][to_snake_case(field)] = drive_info[field] - result[drive_id]['entity'] = entity['Id'] - self._sys['storage'] = normalize_dict(result) + self.log.debug('Updating storage') + for member in self.endpoints['systems'].get_members_names(): + result[member] = {} + members_data = self.endpoints['systems'][member]['Storage'].get_members_data() + for entity in members_data: + for drive in members_data[entity]['Drives']: + data: Dict[str, Any] = Endpoint(drive['@odata.id'], self.endpoints.client).data + drive_id = data['Id'] + result[member][drive_id] = dict() + result[member][drive_id]['redfish_endpoint'] = data['@odata.id'] + for field in fields: + result[member][drive_id][to_snake_case(field)] = data[field] + result[member][drive_id]['entity'] = entity + self._sys['storage'] = normalize_dict(result) def _update_sn(self) -> None: + serials: List[str] = [] self.log.debug('Updating serial number') - self._sys['SKU'] = self._system['Systems']['SKU'] + data: Dict[str, Any] = self.endpoints['systems'].get_members_data() + for sys in data.keys(): + serials.append(data[sys]['SKU']) + self._sys['SKU'] = ','.join(serials) def _update_memory(self) -> None: fields = ['Description', @@ -124,32 +160,26 @@ class RedfishDellSystem(BaseRedfishSystem): 'CapacityMiB', 'Status'] self.log.debug('Updating memory') - self._sys['memory'] = self.build_common_data(data=self._system['Systems'], - fields=fields, - path='Memory') + self.update('systems', 'memory', 'Memory', fields) def _update_power(self) -> None: - fields = { - 'PowerSupplies': [ - 'Name', - 'Model', - 'Manufacturer', - 'Status' - ] - } + fields = [ + 'Name', + 'Model', + 'Manufacturer', + 'Status' + ] self.log.debug('Updating powersupplies') - self._sys['power'] = self.build_chassis_data(fields, 'Power') + self.update('chassis', 'power', 'Power', fields, attribute='PowerSupplies') def _update_fans(self) -> None: - fields = { - 'Fans': [ - 'Name', - 'PhysicalContext', - 'Status' - ], - } + fields = [ + 'Name', + 'PhysicalContext', + 'Status' + ] self.log.debug('Updating fans') - self._sys['fans'] = self.build_chassis_data(fields, 'Thermal') + self.update('chassis', 'fans', 'Thermal', fields, attribute='Fans') def _update_firmwares(self) -> None: fields = [ @@ -161,6 +191,4 @@ class RedfishDellSystem(BaseRedfishSystem): 'Status', ] self.log.debug('Updating firmwares') - self._sys['firmwares'] = self.build_common_data(data=self._system['UpdateService'], - fields=fields, - path='FirmwareInventory') + self.update('update_service', 'firmwares', 'FirmwareInventory', fields) diff --git a/src/ceph-node-proxy/ceph_node_proxy/util.py b/src/ceph-node-proxy/ceph_node_proxy/util.py index 359e0c38cf4..c6af0304b92 100644 --- a/src/ceph-node-proxy/ceph_node_proxy/util.py +++ b/src/ceph-node-proxy/ceph_node_proxy/util.py @@ -184,7 +184,9 @@ def http_req(hostname: str = '', response_code = response.code return response_headers, response_str.decode(), response_code except (HTTPError, URLError) as e: - print(f'{e}') + # Log level is debug only. + # We let whatever calls `http_req()` catching and printing the error + logger.debug(f'url={url} err={e}') # handle error here if needed raise diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 12c03901de8..e38122ddc4b 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -192,16 +192,18 @@ class NodeProxyEndpoint: """ nok_members: List[Dict[str, str]] = [] - for member in data.keys(): - _status = data[member]['status']['health'].lower() - if _status.lower() != 'ok': - state = data[member]['status']['state'] - _member = dict( - member=member, - status=_status, - state=state - ) - nok_members.append(_member) + for sys_id in data.keys(): + for member in data[sys_id].keys(): + _status = data[sys_id][member]['status']['health'].lower() + if _status.lower() != 'ok': + state = data[sys_id][member]['status']['state'] + _member = dict( + sys_id=sys_id, + member=member, + status=_status, + state=state + ) + nok_members.append(_member) return nok_members @@ -229,7 +231,7 @@ class NodeProxyEndpoint: """ for component in data['patch']['status'].keys(): - alert_name = f"HARDWARE_{component.upper()}" + alert_name = f'HARDWARE_{component.upper()}' self.mgr.remove_health_warning(alert_name) nok_members = self.get_nok_members(data['patch']['status'][component]) @@ -239,7 +241,7 @@ class NodeProxyEndpoint: alert_name, summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok', count=count, - detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members], + detail=[f"[{member['sys_id']}]: {member['member']} is {member['status']}: {member['state']}" for member in nok_members], ) @cherrypy.expose diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index cbaff8a5b00..5ecb142cb6a 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -1410,7 +1410,7 @@ class HostCache(): class NodeProxyCache: - def __init__(self, mgr: "CephadmOrchestrator") -> None: + def __init__(self, mgr: 'CephadmOrchestrator') -> None: self.mgr = mgr self.data: Dict[str, Any] = {} self.oob: Dict[str, Any] = {} @@ -1428,7 +1428,7 @@ class NodeProxyCache: if host not in self.mgr.inventory.keys(): # remove entry for host that no longer exists - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", None) + self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/data/{host}', None) try: self.oob.pop(host) self.data.pop(host) @@ -1442,15 +1442,15 @@ class NodeProxyCache: def save(self, host: str = '', data: Dict[str, Any] = {}) -> None: - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", json.dumps(data)) + self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/data/{host}', json.dumps(data)) def update_oob(self, host: str, host_oob_info: Dict[str, str]) -> None: self.oob[host] = host_oob_info - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/oob", json.dumps(self.oob)) + self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/oob', json.dumps(self.oob)) def update_keyring(self, host: str, key: str) -> None: self.keyrings[host] = key - self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/keyrings", json.dumps(self.keyrings)) + self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', json.dumps(self.keyrings)) def fullreport(self, **kw: Any) -> Dict[str, Any]: """ @@ -1500,19 +1500,29 @@ class NodeProxyCache: for host in hosts: _result[host] = {} _result[host]['status'] = {} + state: str = '' data = self.data[host] - for component in data['status'].keys(): - values = data['status'][component].values() - if is_error(values): - state = 'error' - elif is_unknown(values): + for component, details in data['status'].items(): + _sys_id_res: List[str] = [] + for element in details.values(): + values = element.values() + if is_error(values): + state = 'error' + elif is_unknown(values) or not values: + state = 'unknown' + else: + state = 'ok' + _sys_id_res.append(state) + if any([s == 'unknown' for s in _sys_id_res]): state = 'unknown' + elif any([s == 'error' for s in _sys_id_res]): + state = 'error' else: state = 'ok' _result[host]['status'][component] = state - _result[host]['sn'] = data['sn'] - _result[host]['host'] = data['host'] - _result[host]['firmwares'] = data['firmwares'] + _result[host]['sn'] = data['sn'] + _result[host]['host'] = data['host'] + _result[host]['status']['firmwares'] = data['firmwares'] return _result def common(self, endpoint: str, **kw: Any) -> Dict[str, Any]: @@ -1562,18 +1572,19 @@ class NodeProxyCache: def get_critical_from_host(self, hostname: str) -> Dict[str, Any]: results: Dict[str, Any] = {} - for component, data_component in self.data[hostname]['status'].items(): - if component not in results.keys(): - results[component] = {} - for member, data_member in data_component.items(): - if component == 'power': - data_member['status']['health'] = 'critical' - data_member['status']['state'] = 'unplugged' - if component == 'memory': - data_member['status']['health'] = 'critical' - data_member['status']['state'] = 'errors detected' - if data_member['status']['health'].lower() != 'ok': - results[component][member] = data_member + for sys_id, component in self.data[hostname]['status'].items(): + for component_name, data_component in component.items(): + if component_name not in results.keys(): + results[component_name] = {} + for member, data_member in data_component.items(): + if component_name == 'power': + data_member['status']['health'] = 'critical' + data_member['status']['state'] = 'unplugged' + if component_name == 'memory': + data_member['status']['health'] = 'critical' + data_member['status']['state'] = 'errors detected' + if data_member['status']['health'].lower() != 'ok': + results[component_name][member] = data_member return results def criticals(self, **kw: Any) -> Dict[str, Any]: diff --git a/src/pybind/mgr/cephadm/tests/node_proxy_data.py b/src/pybind/mgr/cephadm/tests/node_proxy_data.py index 37e6aaa46c8..fa768f1d4c6 100644 --- a/src/pybind/mgr/cephadm/tests/node_proxy_data.py +++ b/src/pybind/mgr/cephadm/tests/node_proxy_data.py @@ -1,3 +1,3 @@ -full_set_with_critical = {'host': 'host01', 'sn': '12345', 'status': {'storage': {'disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1': {'description': 'Solid State Disk 0:1:0', 'entity': 'RAID.Integrated.1-1', 'capacity_bytes': 959656755200, 'model': 'KPM5XVUG960G', 'protocol': 'SAS', 'serial_number': '8080A1CRTP5F', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 0, 'locationtype': 'Slot'}}}, 'disk.bay.9:enclosure.internal.0-1': {'description': 'PCIe SSD in Slot 9 in Bay 1', 'entity': 'CPU.1', 'capacity_bytes': 1600321314816, 'model': 'Dell Express Flash NVMe P4610 1.6TB SFF', 'protocol': 'PCIe', 'serial_number': 'PHLN035305MN1P6AGN', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 9, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 20, 'total_threads': 40, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'nic.slot.1-1-1': {'description': 'NIC in Slot 1 Port 1 Partition 1', 'name': 'System Ethernet Interface', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'StandbyOffline'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 31237, 'status': {'health': 'Critical', 'state': 'Enabled'}}}}, 'firmwares': {}} +full_set_with_critical = {'host': 'host01', 'sn': '12345', 'status': {'storage': {'1': {'disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1': {'description': 'Solid State Disk 0:1:0', 'entity': 'RAID.Integrated.1-1', 'capacity_bytes': 959656755200, 'model': 'KPM5XVUG960G', 'protocol': 'SAS', 'serial_number': '8080A1CRTP5F', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 0, 'locationtype': 'Slot'}}}, 'disk.bay.9:enclosure.internal.0-1': {'description': 'PCIe SSD in Slot 9 in Bay 1', 'entity': 'CPU.1', 'capacity_bytes': 1600321314816, 'model': 'Dell Express Flash NVMe P4610 1.6TB SFF', 'protocol': 'PCIe', 'serial_number': 'PHLN035305MN1P6AGN', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 9, 'locationtype': 'Slot'}}}}}, 'processors': {'1': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 20, 'total_threads': 40, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}}, 'network': {'1': {'nic.slot.1-1-1': {'description': 'NIC in Slot 1 Port 1 Partition 1', 'name': 'System Ethernet Interface', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'StandbyOffline'}}}}, 'memory': {'1': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 31237, 'status': {'health': 'Critical', 'state': 'Enabled'}}}}}, 'firmwares': {}} mgr_inventory_cache = {'host01': {'hostname': 'host01', 'addr': '10.10.10.11', 'labels': ['_admin'], 'status': '', 'oob': {'hostname': '10.10.10.11', 'username': 'root', 'password': 'ceph123'}}, 'host02': {'hostname': 'host02', 'addr': '10.10.10.12', 'labels': [], 'status': '', 'oob': {'hostname': '10.10.10.12', 'username': 'root', 'password': 'ceph123'}}} -full_set = {'host01': {'host': 'host01', 'sn': 'FR8Y5X3', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'host02': {'host': 'host02', 'sn': 'FR8Y5X4', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}} +full_set = {'host01': {'host': 'host01', 'sn': 'FR8Y5X3', 'status': {'storage': {'1': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}}, 'processors': {'1': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}}, 'network': {'1': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'memory': {'1': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'power': {'1': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'fans': {'1': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'host02': {'host': 'host02', 'sn': 'FR8Y5X4', 'status': {'storage': {'1': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}}, 'processors': {'1': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}}, 'network': {'1': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'memory': {'1': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'power': {'1': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'fans': {'1': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}} diff --git a/src/pybind/mgr/cephadm/tests/test_node_proxy.py b/src/pybind/mgr/cephadm/tests/test_node_proxy.py index b19bb5dbc50..48c881dda95 100644 --- a/src/pybind/mgr/cephadm/tests/test_node_proxy.py +++ b/src/pybind/mgr/cephadm/tests/test_node_proxy.py @@ -109,12 +109,12 @@ class TestNodeProxyEndpoint(helper.CPWebCase): calls = [call('HARDWARE_STORAGE', count=2, - detail=['disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1 is critical: Enabled', - 'disk.bay.9:enclosure.internal.0-1 is critical: Enabled'], + detail=['[1]: disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1 is critical: Enabled', + '[1]: disk.bay.9:enclosure.internal.0-1 is critical: Enabled'], summary='2 storage members are not ok'), call('HARDWARE_MEMORY', count=1, - detail=['dimm.socket.a1 is critical: Enabled'], + detail=['[1]: dimm.socket.a1 is critical: Enabled'], summary='1 memory member is not ok')] assert TestNodeProxyEndpoint.mgr.set_health_warning.mock_calls == calls diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 4c08ace4dbd..4969e1f5eb7 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -516,16 +516,16 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, :param hostname: hostname """ table_heading_mapping = { - 'summary': ['HOST', 'STORAGE', 'CPU', 'NET', 'MEMORY', 'POWER', 'FANS'], + 'summary': ['HOST', 'SN', 'STORAGE', 'CPU', 'NET', 'MEMORY', 'POWER', 'FANS'], 'fullreport': [], 'firmwares': ['HOST', 'COMPONENT', 'NAME', 'DATE', 'VERSION', 'STATUS'], 'criticals': ['HOST', 'COMPONENT', 'NAME', 'STATUS', 'STATE'], - 'memory': ['HOST', 'NAME', 'STATUS', 'STATE'], - 'storage': ['HOST', 'NAME', 'MODEL', 'SIZE', 'PROTOCOL', 'SN', 'STATUS', 'STATE'], - 'processors': ['HOST', 'NAME', 'MODEL', 'CORES', 'THREADS', 'STATUS', 'STATE'], - 'network': ['HOST', 'NAME', 'SPEED', 'STATUS', 'STATE'], - 'power': ['HOST', 'ID', 'NAME', 'MODEL', 'MANUFACTURER', 'STATUS', 'STATE'], - 'fans': ['HOST', 'ID', 'NAME', 'STATUS', 'STATE'] + 'memory': ['HOST', 'SYS_ID', 'NAME', 'STATUS', 'STATE'], + 'storage': ['HOST', 'SYS_ID', 'NAME', 'MODEL', 'SIZE', 'PROTOCOL', 'SN', 'STATUS', 'STATE'], + 'processors': ['HOST', 'SYS_ID', 'NAME', 'MODEL', 'CORES', 'THREADS', 'STATUS', 'STATE'], + 'network': ['HOST', 'SYS_ID', 'NAME', 'SPEED', 'STATUS', 'STATE'], + 'power': ['HOST', 'CHASSIS_ID', 'ID', 'NAME', 'MODEL', 'MANUFACTURER', 'STATUS', 'STATE'], + 'fans': ['HOST', 'CHASSIS_ID', 'ID', 'NAME', 'STATUS', 'STATE'] } if category not in table_heading_mapping.keys(): @@ -542,21 +542,23 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, output = json.dumps(summary) else: for k, v in summary.items(): - row = [k] - row.extend([v['status'][key] for key in ['storage', 'processors', 'network', 'memory', 'power', 'fans']]) + row = [k, v['sn']] + row.extend([v['status'][key] for key in ['storage', 'processors', + 'network', 'memory', + 'power', 'fans']]) table.add_row(row) output = table.get_string() elif category == 'fullreport': if hostname is None: - output = "Missing host name" + output = 'Missing host name' elif format != Format.json: - output = "fullreport only supports json output" + output = 'fullreport only supports json output' else: completion = self.node_proxy_fullreport(hostname=hostname) fullreport: Dict[str, Any] = raise_if_exception(completion) output = json.dumps(fullreport) elif category == 'firmwares': - output = "Missing host name" if hostname is None else self._firmwares_table(hostname, table, format) + output = 'Missing host name' if hostname is None else self._firmwares_table(hostname, table, format) elif category == 'criticals': output = self._criticals_table(hostname, table, format) else: @@ -572,7 +574,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return json.dumps(data) for host, details in data.items(): for k, v in details.items(): - table.add_row((host, k, v['name'], v['release_date'], v['version'], v['status']['health'])) + try: + status = v['status']['health'] + except (KeyError, TypeError): + status = 'N/A' + table.add_row((host, k, v['name'], v['release_date'], v['version'], status)) return table.get_string() def _criticals_table(self, hostname: Optional[str], table: PrettyTable, format: Format) -> str: @@ -604,20 +610,21 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, } fields = mapping.get(category, ()) - for host, details in data.items(): - for k, v in details.items(): - row = [] - for field in fields: - if field in v: - row.append(v[field]) - elif field in v.get('status', {}): - row.append(v['status'][field]) + for host in data.keys(): + for sys_id, details in data[host].items(): + for k, v in details.items(): + row = [] + for field in fields: + if field in v: + row.append(v[field]) + elif field in v.get('status', {}): + row.append(v['status'][field]) + else: + row.append('') + if category in ('power', 'fans', 'processors'): + table.add_row((host, sys_id,) + (k,) + tuple(row)) else: - row.append('') - if category in ('power', 'fans', 'processors'): - table.add_row((host,) + (k,) + tuple(row)) - else: - table.add_row((host,) + tuple(row)) + table.add_row((host, sys_id,) + tuple(row)) return table.get_string() @@ -643,7 +650,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, data = raise_if_exception(completion) output: str = '' if action == self.HardwareLightAction.get: - status = 'on' if data["LocationIndicatorActive"] else 'off' + status = 'on' if data['LocationIndicatorActive'] else 'off' if light_type == self.HardwareLightType.device: output = f'ident LED for {device} on {hostname} is: {status}' else: