From 036c02a5c2815a8aea8e7723d136389edd7ff643 Mon Sep 17 00:00:00 2001 From: Volker Theile Date: Wed, 29 Jan 2020 09:36:24 +0100 Subject: [PATCH] mgr/dashboard: smart: smart data read out on down osd causes error popup Fixes: https://tracker.ceph.com/issues/43665 Signed-off-by: Volker Theile --- .../smart-list/smart-list.component.html | 20 ++-- .../shared/smart-list/smart-list.component.ts | 13 ++- .../mgr/dashboard/services/ceph_service.py | 99 +++++++++++++++---- 3 files changed, 102 insertions(+), 30 deletions(-) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.html index 16f940e6d4d..3bc29c518dc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.html @@ -1,10 +1,16 @@ - - The data received has the JSON format version 2.x and is currently incompatible with the dashboard. - - + Failed to retrieve SMART data. + The data received has the JSON format version 2.x and is currently incompatible with the dashboard. + + + No SMART data available. + @@ -55,7 +61,7 @@ - + SMART data is loading. diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.ts index 95281203cf5..d530e3462fb 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.ts @@ -27,6 +27,7 @@ export class SmartListComponent implements OnInit, OnChanges { loading = false; incompatible = false; + error = false; data: { [deviceId: string]: SmartDataResult | SmartErrorResult } = {}; @@ -130,9 +131,17 @@ export class SmartListComponent implements OnInit, OnChanges { this.loading = true; if (this.osdId !== null) { - this.osdService.getSmartData(this.osdId).subscribe(this.fetchData.bind(this)); + this.osdService.getSmartData(this.osdId).subscribe(this.fetchData.bind(this), (error) => { + error.preventDefault(); + this.error = error; + this.loading = false; + }); } else if (this.hostname !== null) { - this.hostService.getSmartData(this.hostname).subscribe(this.fetchData.bind(this)); + this.hostService.getSmartData(this.hostname).subscribe(this.fetchData.bind(this), (error) => { + error.preventDefault(); + this.error = error; + this.loading = false; + }); } } diff --git a/src/pybind/mgr/dashboard/services/ceph_service.py b/src/pybind/mgr/dashboard/services/ceph_service.py index 506c2f98dfe..f9d2738ff0d 100644 --- a/src/pybind/mgr/dashboard/services/ceph_service.py +++ b/src/pybind/mgr/dashboard/services/ceph_service.py @@ -9,13 +9,13 @@ from mgr_module import CommandResult from mgr_util import get_time_series_rates, get_most_recent_rate from .. import mgr +from ..exceptions import DashboardException try: from typing import Dict, Any # pylint: disable=unused-import except ImportError: pass # For typing only - logger = logging.getLogger('ceph_service') @@ -166,49 +166,106 @@ class CephService(object): @staticmethod def _get_smart_data_by_device(device): + # Check whether the device is associated with daemons. if 'daemons' in device and device['daemons']: - daemons = [daemon for daemon in device['daemons'] if daemon.startswith('osd')] - if daemons: - svc_type, svc_id = daemons[0].split('.') - dev_smart_data = CephService.send_command( - svc_type, 'smart', svc_id, devid=device['devid']) + dev_smart_data = None + + # The daemons associated with the device. Note, the list may + # contain daemons that are 'down' or 'destroyed'. + daemons = device.get('daemons') + + # Get a list of all OSD daemons on all hosts that are 'up' + # because SMART data can not be retrieved from daemons that + # are 'down' or 'destroyed'. + osd_tree = CephService.send_command('mon', 'osd tree') + osd_daemons_up = [ + node['name'] for node in osd_tree.get('nodes', {}) + if node.get('status') == 'up' + ] + + # Finally get the daemons on the host of the given device + # that are 'up'. All daemons on the same host can deliver + # SMART data, thus it is not relevant for us which daemon + # we are using. + daemons = list(set(daemons) & set(osd_daemons_up)) + + for daemon in daemons: + svc_type, svc_id = daemon.split('.') + try: + dev_smart_data = CephService.send_command( + svc_type, 'smart', svc_id, devid=device['devid']) + except SendCommandError: + # Try to retrieve SMART data from another daemon. + continue for dev_id, dev_data in dev_smart_data.items(): if 'error' in dev_data: logger.warning( - '[SMART] error retrieving smartctl data for device ID "%s": %s', dev_id, - dev_data) - return dev_smart_data - logger.warning('[SMART] no OSD service found for device ID "%s"', device['devid']) - return {} - logger.warning('[SMART] key "daemon" not found for device ID "%s"', device['devid']) + '[SMART] Error retrieving smartctl data for device ID "%s": %s', + dev_id, dev_data) + break + if dev_smart_data is None: + raise DashboardException( + 'Failed to retrieve SMART data for device ID "{}"'.format( + device['devid'])) + return dev_smart_data + logger.warning('[SMART] No daemons associated with device ID "%s"', + device['devid']) return {} @staticmethod def get_devices_by_host(hostname): # (str) -> dict - return CephService.send_command('mon', 'device ls-by-host', host=hostname) + return CephService.send_command('mon', + 'device ls-by-host', + host=hostname) + + @staticmethod + def get_devices_by_daemon(daemon_type, daemon_id): + # (str, str) -> dict + return CephService.send_command('mon', + 'device ls-by-daemon', + who='{}.{}'.format( + daemon_type, daemon_id)) @staticmethod def get_smart_data_by_host(hostname): # type: (str) -> dict + """ + Get the SMART data of all devices on the given host, regardless + of the daemon (osd, mon, ...). + :param hostname: The name of the host. + :return: A dictionary containing the SMART data of every device + on the given host. The device name is used as the key in the + dictionary. + """ devices = CephService.get_devices_by_host(hostname) smart_data = {} if devices: for device in devices: if device['devid'] not in smart_data: - smart_data.update(CephService._get_smart_data_by_device(device)) + smart_data.update( + CephService._get_smart_data_by_device(device)) return smart_data @staticmethod def get_smart_data_by_daemon(daemon_type, daemon_id): # type: (str, str) -> dict - smart_data = CephService.send_command(daemon_type, 'smart', daemon_id) - if smart_data: - for _, dev_data in smart_data.items(): - if 'error' in dev_data: - logger.warning('[SMART] Error retrieving smartctl data for daemon "%s.%s"', - daemon_type, daemon_id) - return smart_data or {} + """ + Get the SMART data of the devices associated with the given daemon. + :param daemon_type: The daemon type, e.g. 'osd' or 'mon'. + :param daemon_id: The daemon identifier. + :return: A dictionary containing the SMART data of every device + associated with the given daemon. The device name is used as the + key in the dictionary. + """ + devices = CephService.get_devices_by_daemon(daemon_type, daemon_id) + smart_data = {} + if devices: + for device in devices: + if device['devid'] not in smart_data: + smart_data.update( + CephService._get_smart_data_by_device(device)) + return smart_data @classmethod def get_rates(cls, svc_type, svc_name, path): -- 2.47.3