From: Guillaume Abrioux Date: Wed, 27 Sep 2023 08:27:28 +0000 (+0000) Subject: node-proxy: split redfishdell class X-Git-Tag: v18.2.4~314^2~59 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=83964cf27ea8fb61ec753dc3aed62a10bc8082f4;p=ceph.git node-proxy: split redfishdell class This refactor splits the redfishdell class in order to collect power and thermal details from the redfish API. 'power' and 'thermal' details differ in several ways: - they are not available at the same endpoint, - their data structures are different. For these two reasons, let's split that class. Signed-off-by: Guillaume Abrioux (cherry picked from commit db0172186a753d57c357a5396378d1158e3167e3) --- diff --git a/src/cephadm/cephadmlib/node_proxy/baseredfishsystem.py b/src/cephadm/cephadmlib/node_proxy/baseredfishsystem.py new file mode 100644 index 0000000000000..2e1d9087d7978 --- /dev/null +++ b/src/cephadm/cephadmlib/node_proxy/baseredfishsystem.py @@ -0,0 +1,139 @@ +from .basesystem import BaseSystem +from .redfish_client import RedFishClient +from threading import Thread, Lock +from time import sleep +from .util import Logger, retry +from typing import Dict, Any, List + + +class BaseRedfishSystem(BaseSystem): + def __init__(self, **kw: Any) -> None: + super().__init__(**kw) + self.log = Logger(__name__) + self.host: str = kw['host'] + self.username: str = kw['username'] + self.password: str = kw['password'] + # move the following line (class attribute?) 
+ self.client = RedFishClient(host=self.host, username=self.username, password=self.password) + self.log.logger.info(f"redfish system initialization, host: {self.host}, user: {self.username}") + + self.run: bool = False + self.thread: Thread + self.data_ready: bool = False + self.previous_data: Dict = {} + self.lock: Lock = Lock() + self.data: Dict[str, Dict[str, Any]] = {} + self._system: Dict[str, Dict[str, Any]] = {} + self.start_client() + + def start_client(self) -> None: + if not self.client: + self.client = RedFishClient(host=self.host, username=self.username, password=self.password) + self.client.login() + + def start_update_loop(self) -> None: + self.run = True + self.thread = Thread(target=self.update) + self.thread.start() + + def stop_update_loop(self) -> None: + self.run = False + self.thread.join() + + def update(self) -> None: + # this loop can have: + # - caching logic + try: + while self.run: + self.log.logger.debug("waiting for a lock.") + self.lock.acquire() + self.log.logger.debug("lock acquired.") + try: + self._update_system() + # following calls in theory can be done in parallel + self._update_metadata() + self._update_memory() + self._update_power() + self._update_fans() + self._update_network() + self._update_processors() + self._update_storage() + self.data_ready = True + sleep(5) + finally: + self.lock.release() + self.log.logger.debug("lock released.") + # Catching 'Exception' is probably not a good idea (devel only) + except Exception as e: + self.log.logger.error(f"Error detected, logging out from redfish api.\n{e}") + self.client.logout() + raise + + def flush(self) -> None: + self.log.logger.info("Acquiring lock to flush data.") + self.lock.acquire() + self.log.logger.info("Lock acquired, flushing data.") + self._system = {} + self.previous_data = {} + self.log.logger.info("Data flushed.") + self.data_ready = False + self.log.logger.info("Data marked as not ready.") + self.lock.release() + self.log.logger.info("Lock released.") + + 
@retry(retries=10, delay=2) + def _get_path(self, path: str) -> Dict: + result = self.client.get_path(path) + if result is None: + self.log.logger.error(f"The client reported an error when getting path: {path}") + raise RuntimeError(f"Could not get path: {path}") + return result + + def get_members(self, path: str) -> List: + _path = self._system[path]['@odata.id'] + data = self._get_path(_path) + return [self._get_path(member['@odata.id']) for member in data['Members']] + + def build_data(self, + fields: List, + path: str) -> Dict[str, Dict[str, Dict]]: + raise NotImplementedError() + + # def _update_system(self) -> None: + # raise NotImplementedError() + + def get_system(self) -> Dict[str, Dict[str, Dict]]: + result = { + 'storage': self.get_storage(), + 'processors': self.get_processors(), + 'network': self.get_network(), + 'memory': self.get_memory(), + 'power': self.get_power(), + 'fans': self.get_fans() + } + return result + + def _update_system(self) -> None: + redfish_system = self.client.get_path(self.system_endpoint) + self._system = {**redfish_system, **self._system} + + def _update_metadata(self) -> None: + raise NotImplementedError() + + def _update_memory(self) -> None: + raise NotImplementedError() + + def _update_power(self) -> None: + raise NotImplementedError() + + def _update_fans(self) -> None: + raise NotImplementedError() + + def _update_network(self) -> None: + raise NotImplementedError() + + def _update_processors(self) -> None: + raise NotImplementedError() + + def _update_storage(self) -> None: + raise NotImplementedError() diff --git a/src/cephadm/cephadmlib/node_proxy/redfish_dell.py b/src/cephadm/cephadmlib/node_proxy/redfish_dell.py index 796c989f8834d..f6a01664629c9 100644 --- a/src/cephadm/cephadmlib/node_proxy/redfish_dell.py +++ b/src/cephadm/cephadmlib/node_proxy/redfish_dell.py @@ -1,63 +1,14 @@ -from .redfish_system import RedfishSystem -from .util import Logger, normalize_dict, to_snake_case -from typing import Dict, Any +from 
.redfishdellchassis import RedfishDellChassis +from .redfishdellsystem import RedfishDellSystem +from .util import Logger +from typing import Any -class RedfishDell(RedfishSystem): +class RedfishDell(RedfishDellSystem, RedfishDellChassis): def __init__(self, **kw: Any) -> None: - self.log = Logger(__name__) if kw.get('system_endpoint') is None: kw['system_endpoint'] = '/Systems/System.Embedded.1' + if kw.get('chassis_endpoint') is None: + kw['chassis_endpoint'] = '/Chassis/System.Embedded.1' super().__init__(**kw) - - def _update_network(self) -> None: - fields = ['Description', 'Name', 'SpeedMbps', 'Status'] - self.log.logger.info("Updating network") - self._system['network'] = self.build_data(fields, 'EthernetInterfaces') - - def _update_processors(self) -> None: - fields = ['Description', - 'TotalCores', - 'TotalThreads', - 'ProcessorType', - 'Model', - 'Status', - 'Manufacturer'] - self.log.logger.info("Updating processors") - self._system['processors'] = self.build_data(fields, 'Processors') - - def _update_storage(self) -> None: - fields = ['Description', - 'CapacityBytes', - 'Model', 'Protocol', - 'SerialNumber', 'Status', - 'PhysicalLocation'] - entities = self.get_members('Storage') - self.log.logger.info("Updating storage") - result: Dict[str, Dict[str, Dict]] = dict() - for entity in entities: - for drive in entity['Drives']: - drive_path = drive['@odata.id'] - drive_info = self._get_path(drive_path) - drive_id = drive_info['Id'] - result[drive_id] = dict() - for field in fields: - result[drive_id][to_snake_case(field)] = drive_info[field] - result[drive_id]['entity'] = entity['Id'] - self._system['storage'] = normalize_dict(result) - - def _update_metadata(self) -> None: - self.log.logger.info("Updating metadata") - pass - - def _update_memory(self) -> None: - fields = ['Description', - 'MemoryDeviceType', - 'CapacityMiB', - 'Status'] - self.log.logger.info("Updating memory") - self._system['memory'] = self.build_data(fields, 'Memory') - - def 
_update_power(self) -> None: - self.log.logger.info("Updating power") - pass + self.log = Logger(__name__) diff --git a/src/cephadm/cephadmlib/node_proxy/redfish_system.py b/src/cephadm/cephadmlib/node_proxy/redfish_system.py deleted file mode 100644 index 95c82960eae1c..0000000000000 --- a/src/cephadm/cephadmlib/node_proxy/redfish_system.py +++ /dev/null @@ -1,160 +0,0 @@ -from .basesystem import BaseSystem -from .redfish_client import RedFishClient -from threading import Thread, Lock -from time import sleep -from .util import Logger, retry, normalize_dict, to_snake_case -from typing import Dict, Any, List - - -class RedfishSystem(BaseSystem): - def __init__(self, **kw: Any) -> None: - super().__init__(**kw) - self.log = Logger(__name__) - self.host: str = kw['host'] - self.username: str = kw['username'] - self.password: str = kw['password'] - self.system_endpoint = kw.get('system_endpoint', '/Systems/1') - self.client = RedFishClient(host=self.host, username=self.username, password=self.password) - self.log.logger.info(f"redfish system initialization, host: {self.host}, user: {self.username}") - - self._system: Dict[str, Dict[str, Any]] = {} - self.run: bool = False - self.thread: Thread - self.start_client() - self.data_ready: bool = False - self.previous_data: Dict = {} - self.lock: Lock = Lock() - - @retry(retries=10, delay=2) - def _get_path(self, path: str) -> Dict: - result = self.client.get_path(path) - if result is None: - self.log.logger.error(f"The client reported an error when getting path: {path}") - raise RuntimeError(f"Could not get path: {path}") - return result - - def get_members(self, path: str) -> List: - _path = self._system[path]['@odata.id'] - data = self._get_path(_path) - return [self._get_path(member['@odata.id']) for member in data['Members']] - - def build_data(self, - fields: List, - path: str) -> Dict[str, Dict[str, Dict]]: - result: Dict[str, Dict[str, Dict]] = dict() - for member_info in self.get_members(path): - member_id = 
member_info['Id'] - result[member_id] = dict() - for field in fields: - try: - result[member_id][to_snake_case(field)] = member_info[field] - except KeyError: - self.log.logger.warning(f"Could not find field: {field} in member_info: {member_info}") - - return normalize_dict(result) - - def start_client(self) -> None: - if not self.client: - self.client = RedFishClient(host=self.host, username=self.username, password=self.password) - self.client.login() - - def get_system(self) -> Dict[str, Dict[str, Dict]]: - result = { - 'storage': self.get_storage(), - 'processors': self.get_processors(), - 'network': self.get_network(), - 'memory': self.get_memory(), - } - return result - - def get_status(self) -> Dict[str, Dict[str, Dict]]: - return self._system['status'] - - def get_metadata(self) -> Dict[str, Dict[str, Dict]]: - return self._system['metadata'] - - def get_memory(self) -> Dict[str, Dict[str, Dict]]: - return self._system['memory'] - - def get_power(self) -> Dict[str, Dict[str, Dict]]: - return self._system['power'] - - def get_processors(self) -> Dict[str, Dict[str, Dict]]: - return self._system['processors'] - - def get_network(self) -> Dict[str, Dict[str, Dict]]: - return self._system['network'] - - def get_storage(self) -> Dict[str, Dict[str, Dict]]: - return self._system['storage'] - - def _update_system(self) -> None: - redfish_system = self.client.get_path(self.system_endpoint) - self._system = {**redfish_system, **self._system} - - def _update_metadata(self) -> None: - raise NotImplementedError() - - def _update_memory(self) -> None: - raise NotImplementedError() - - def _update_power(self) -> None: - raise NotImplementedError() - - def _update_network(self) -> None: - raise NotImplementedError() - - def _update_processors(self) -> None: - raise NotImplementedError() - - def _update_storage(self) -> None: - raise NotImplementedError() - - def start_update_loop(self) -> None: - self.run = True - self.thread = Thread(target=self.update) - 
self.thread.start() - - def stop_update_loop(self) -> None: - self.run = False - self.thread.join() - - def update(self) -> None: - # this loop can have: - # - caching logic - try: - while self.run: - self.log.logger.debug("waiting for a lock.") - self.lock.acquire() - self.log.logger.debug("lock acquired.") - try: - self._update_system() - # following calls in theory can be done in parallel - self._update_metadata() - self._update_memory() - self._update_power() - self._update_network() - self._update_processors() - self._update_storage() - self.data_ready = True - sleep(5) - finally: - self.lock.release() - self.log.logger.debug("lock released.") - # Catching 'Exception' is probably not a good idea (devel only) - except Exception as e: - self.log.logger.error(f"Error detected, logging out from redfish api.\n{e}") - self.client.logout() - raise - - def flush(self) -> None: - self.log.logger.info("Acquiring lock to flush data.") - self.lock.acquire() - self.log.logger.info("Lock acquired, flushing data.") - self._system = {} - self.previous_data = {} - self.log.logger.info("Data flushed.") - self.data_ready = False - self.log.logger.info("Data marked as not ready.") - self.lock.release() - self.log.logger.info("Lock released.") diff --git a/src/cephadm/cephadmlib/node_proxy/redfishdellchassis.py b/src/cephadm/cephadmlib/node_proxy/redfishdellchassis.py new file mode 100644 index 0000000000000..39610dc744562 --- /dev/null +++ b/src/cephadm/cephadmlib/node_proxy/redfishdellchassis.py @@ -0,0 +1,67 @@ +from .baseredfishsystem import BaseRedfishSystem +from .redfish_client import RedFishClient +from threading import Thread, Lock +from time import sleep +from .util import Logger, retry, normalize_dict, to_snake_case +from typing import Dict, Any, List, Union + + +class RedfishDellChassis(BaseRedfishSystem): + def __init__(self, **kw: Any) -> None: + self.chassis_endpoint = kw.get('chassis_endpoint', '/Chassis/System.Embedded.1') + super().__init__(**kw) + self.log = 
Logger(__name__) + self.log.logger.info(f"{__name__} initialization.") + + def get_power(self) -> Dict[str, Dict[str, Dict]]: + return self._system['power'] + + def get_fans(self) -> Dict[str, Dict[str, Dict]]: + return self._system['fans'] + + def get_chassis(self) -> Dict[str, Dict[str, Dict]]: + result = { + 'power': self.get_power(), + 'fans': self.get_fans() + } + return result + + def _update_power(self) -> None: + fields = { + "PowerSupplies": [ + "Name", + "Model", + "Manufacturer", + "Status" + ] + } + self.log.logger.info("Updating powersupplies") + self._system['power'] = self.build_chassis_data(fields, 'Power') + + def _update_fans(self) -> None: + fields = { + "Fans": [ + "Name", + "PhysicalContext", + "Status" + ], + } + self.log.logger.info("Updating fans") + self._system['fans'] = self.build_chassis_data(fields, 'Thermal') + + def build_chassis_data(self, + fields: Dict[str, List[str]], + path: str) -> Dict[str, Dict[str, Dict]]: + result: Dict[str, Dict[str, Dict]] = dict() + data = self._get_path(f"{self.chassis_endpoint}/{path}") + + for elt, _fields in fields.items(): + for member_elt in data[elt]: + _id = member_elt['MemberId'] + result[_id] = dict() + for field in _fields: + try: + result[_id][to_snake_case(field)] = member_elt[field] + except KeyError: + self.log.logger.warning(f"Could not find field: {field} in data: {data[elt]}") + return normalize_dict(result) diff --git a/src/cephadm/cephadmlib/node_proxy/redfishdellsystem.py b/src/cephadm/cephadmlib/node_proxy/redfishdellsystem.py new file mode 100644 index 0000000000000..de9756fe79fe8 --- /dev/null +++ b/src/cephadm/cephadmlib/node_proxy/redfishdellsystem.py @@ -0,0 +1,95 @@ +from .baseredfishsystem import BaseRedfishSystem +from .util import Logger, normalize_dict, to_snake_case +from typing import Dict, Any, List + + +class RedfishDellSystem(BaseRedfishSystem): + def __init__(self, **kw: Any) -> None: + self.system_endpoint = kw.get('system_endpoint', '/Systems/System.Embedded.1') + 
super().__init__(**kw) + self.log = Logger(__name__) + + def build_system_data(self, + fields: List, + path: str) -> Dict[str, Dict[str, Dict]]: + result: Dict[str, Dict[str, Dict]] = dict() + for member_info in self.get_members(path): + member_id = member_info['Id'] + result[member_id] = dict() + for field in fields: + try: + result[member_id][to_snake_case(field)] = member_info[field] + except KeyError: + self.log.logger.warning(f"Could not find field: {field} in member_info: {member_info}") + + return normalize_dict(result) + + def get_status(self) -> Dict[str, Dict[str, Dict]]: + return self._system['status'] + + def get_metadata(self) -> Dict[str, Dict[str, Dict]]: + return self._system['metadata'] + + def get_memory(self) -> Dict[str, Dict[str, Dict]]: + return self._system['memory'] + + def get_processors(self) -> Dict[str, Dict[str, Dict]]: + return self._system['processors'] + + def get_network(self) -> Dict[str, Dict[str, Dict]]: + return self._system['network'] + + def get_storage(self) -> Dict[str, Dict[str, Dict]]: + return self._system['storage'] + + # def _update_system(self) -> None: + # redfish_system = self.client.get_path(self.system_endpoint) + # self._system = {**redfish_system, **self._system} + + def _update_network(self) -> None: + fields = ['Description', 'Name', 'SpeedMbps', 'Status'] + self.log.logger.info("Updating network") + self._system['network'] = self.build_system_data(fields, 'EthernetInterfaces') + + def _update_processors(self) -> None: + fields = ['Description', + 'TotalCores', + 'TotalThreads', + 'ProcessorType', + 'Model', + 'Status', + 'Manufacturer'] + self.log.logger.info("Updating processors") + self._system['processors'] = self.build_system_data(fields, 'Processors') + + def _update_storage(self) -> None: + fields = ['Description', + 'CapacityBytes', + 'Model', 'Protocol', + 'SerialNumber', 'Status', + 'PhysicalLocation'] + entities = self.get_members('Storage') + self.log.logger.info("Updating storage") + result: 
Dict[str, Dict[str, Dict]] = dict() + for entity in entities: + for drive in entity['Drives']: + drive_path = drive['@odata.id'] + drive_info = self._get_path(drive_path) + drive_id = drive_info['Id'] + result[drive_id] = dict() + for field in fields: + result[drive_id][to_snake_case(field)] = drive_info[field] + result[drive_id]['entity'] = entity['Id'] + self._system['storage'] = normalize_dict(result) + + def _update_metadata(self) -> None: + self.log.logger.info("Updating metadata") + pass + + def _update_memory(self) -> None: + fields = ['Description', + 'MemoryDeviceType', + 'CapacityMiB', + 'Status'] + self.log.logger.info("Updating memory") + self._system['memory'] = self.build_system_data(fields, 'Memory') diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index d3376d2358890..697f097e14348 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -147,7 +147,10 @@ class NodeProxy: # Force a fake error for testing purpose if component == 'storage': _status = 'critical' - state = "Fake error" + state = "[Fake error] device is faulty." + elif component == 'power': + _status = 'critical' + state = "[Fake error] power supply unplugged." else: _status = data[component][member]['status']['health'].lower() if _status.lower() != 'ok': @@ -167,6 +170,8 @@ class NodeProxy: 'memory': 'NODE_PROXY_MEMORY', 'processors': 'NODE_PROXY_PROCESSORS', 'network': 'NODE_PROXY_NETWORK', + 'power': 'NODE_PROXY_POWER', + 'fans': 'NODE_PROXY_FANS' } for component in data['data'].keys():