git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: smart: smart data read out on down osd causes error popup 32953/head
author Volker Theile <vtheile@suse.com>
Wed, 29 Jan 2020 08:36:24 +0000 (09:36 +0100)
committer Volker Theile <vtheile@suse.com>
Tue, 25 Feb 2020 08:17:14 +0000 (09:17 +0100)
Fixes: https://tracker.ceph.com/issues/43665
Signed-off-by: Volker Theile <vtheile@suse.com>
src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.html
src/pybind/mgr/dashboard/frontend/src/app/ceph/shared/smart-list/smart-list.component.ts
src/pybind/mgr/dashboard/services/ceph_service.py

index 16f940e6d4ddbd1966e4e03b05c5a9e1802f4c73..3bc29c518dcfcccd2da0878f26c52180e92be151 100644 (file)
@@ -1,10 +1,16 @@
 <ng-container *ngIf="!loading; else isLoading">
-  <ng-container *ngIf="incompatible; else isCompatible">
-    <cd-alert-panel id="alert-wrong-format"
-                    type="warning"
-                    i18n>The data received has the JSON format version 2.x and is currently incompatible with the dashboard.</cd-alert-panel>
-  </ng-container>
-  <ng-template #isCompatible>
+  <cd-alert-panel *ngIf="error"
+                  type="error"
+                  i18n>Failed to retrieve SMART data.</cd-alert-panel>
+  <cd-alert-panel *ngIf="incompatible"
+                  type="warning"
+                  i18n>The data received has the JSON format version 2.x and is currently incompatible with the dashboard.</cd-alert-panel>
+
+  <ng-container *ngIf="!error && !incompatible">
+    <cd-alert-panel *ngIf="!(data | keyvalue).length"
+                    type="info"
+                    i18n>No SMART data available.</cd-alert-panel>
+
     <tabset *ngFor="let device of data | keyvalue">
       <tab [heading]="device.value.device + ' (' + device.value.identifier + ')'">
         <ng-container *ngIf="device.value.error; else noError">
@@ -55,7 +61,7 @@
         </ng-template>
       </tab>
     </tabset>
-  </ng-template>
+  </ng-container>
 </ng-container>
 <ng-template #isLoading>
   <cd-loading-panel i18n>SMART data is loading.</cd-loading-panel>
index 95281203cf519c9caa6e11fe01b4757ce0ee0253..d530e3462fb3d6cab877071ca9c1e0a2c74c8242 100644 (file)
@@ -27,6 +27,7 @@ export class SmartListComponent implements OnInit, OnChanges {
 
   loading = false;
   incompatible = false;
+  error = false;
 
   data: { [deviceId: string]: SmartDataResult | SmartErrorResult } = {};
 
@@ -130,9 +131,17 @@ export class SmartListComponent implements OnInit, OnChanges {
     this.loading = true;
 
     if (this.osdId !== null) {
-      this.osdService.getSmartData(this.osdId).subscribe(this.fetchData.bind(this));
+      this.osdService.getSmartData(this.osdId).subscribe(this.fetchData.bind(this), (error) => {
+        error.preventDefault();
+        this.error = error;
+        this.loading = false;
+      });
     } else if (this.hostname !== null) {
-      this.hostService.getSmartData(this.hostname).subscribe(this.fetchData.bind(this));
+      this.hostService.getSmartData(this.hostname).subscribe(this.fetchData.bind(this), (error) => {
+        error.preventDefault();
+        this.error = error;
+        this.loading = false;
+      });
     }
   }
 
index 506c2f98dfed44c7a25c10c96ba50034b1fb3a65..f9d2738ff0d76c8fa4597e2742f85b6b417b5394 100644 (file)
@@ -9,13 +9,13 @@ from mgr_module import CommandResult
 from mgr_util import get_time_series_rates, get_most_recent_rate
 
 from .. import mgr
+from ..exceptions import DashboardException
 
 try:
     from typing import Dict, Any  # pylint: disable=unused-import
 except ImportError:
     pass  # For typing only
 
-
 logger = logging.getLogger('ceph_service')
 
 
@@ -166,49 +166,106 @@ class CephService(object):
 
     @staticmethod
     def _get_smart_data_by_device(device):
+        # Check whether the device is associated with daemons.
         if 'daemons' in device and device['daemons']:
-            daemons = [daemon for daemon in device['daemons'] if daemon.startswith('osd')]
-            if daemons:
-                svc_type, svc_id = daemons[0].split('.')
-                dev_smart_data = CephService.send_command(
-                    svc_type, 'smart', svc_id, devid=device['devid'])
+            dev_smart_data = None
+
+            # The daemons associated with the device. Note, the list may
+            # contain daemons that are 'down' or 'destroyed'.
+            daemons = device.get('daemons')
+
+            # Get a list of all OSD daemons on all hosts that are 'up'
+            # because SMART data can not be retrieved from daemons that
+            # are 'down' or 'destroyed'.
+            osd_tree = CephService.send_command('mon', 'osd tree')
+            osd_daemons_up = [
+                node['name'] for node in osd_tree.get('nodes', {})
+                if node.get('status') == 'up'
+            ]
+
+            # Finally get the daemons on the host of the given device
+            # that are 'up'. All daemons on the same host can deliver
+            # SMART data, thus it is not relevant for us which daemon
+            # we are using.
+            daemons = list(set(daemons) & set(osd_daemons_up))
+
+            for daemon in daemons:
+                svc_type, svc_id = daemon.split('.')
+                try:
+                    dev_smart_data = CephService.send_command(
+                        svc_type, 'smart', svc_id, devid=device['devid'])
+                except SendCommandError:
+                    # Try to retrieve SMART data from another daemon.
+                    continue
                 for dev_id, dev_data in dev_smart_data.items():
                     if 'error' in dev_data:
                         logger.warning(
-                            '[SMART] error retrieving smartctl data for device ID "%s": %s', dev_id,
-                            dev_data)
-                return dev_smart_data
-            logger.warning('[SMART] no OSD service found for device ID "%s"', device['devid'])
-            return {}
-        logger.warning('[SMART] key "daemon" not found for device ID "%s"', device['devid'])
+                            '[SMART] Error retrieving smartctl data for device ID "%s": %s',
+                            dev_id, dev_data)
+                break
+            if dev_smart_data is None:
+                raise DashboardException(
+                    'Failed to retrieve SMART data for device ID "{}"'.format(
+                        device['devid']))
+            return dev_smart_data
+        logger.warning('[SMART] No daemons associated with device ID "%s"',
+                       device['devid'])
         return {}
 
     @staticmethod
     def get_devices_by_host(hostname):
         # (str) -> dict
-        return CephService.send_command('mon', 'device ls-by-host', host=hostname)
+        return CephService.send_command('mon',
+                                        'device ls-by-host',
+                                        host=hostname)
+
+    @staticmethod
+    def get_devices_by_daemon(daemon_type, daemon_id):
+        # (str, str) -> dict
+        return CephService.send_command('mon',
+                                        'device ls-by-daemon',
+                                        who='{}.{}'.format(
+                                            daemon_type, daemon_id))
 
     @staticmethod
     def get_smart_data_by_host(hostname):
         # type: (str) -> dict
+        """
+        Get the SMART data of all devices on the given host, regardless
+        of the daemon (osd, mon, ...).
+        :param hostname: The name of the host.
+        :return: A dictionary containing the SMART data of every device
+          on the given host. The device name is used as the key in the
+          dictionary.
+        """
         devices = CephService.get_devices_by_host(hostname)
         smart_data = {}
         if devices:
             for device in devices:
                 if device['devid'] not in smart_data:
-                    smart_data.update(CephService._get_smart_data_by_device(device))
+                    smart_data.update(
+                        CephService._get_smart_data_by_device(device))
         return smart_data
 
     @staticmethod
     def get_smart_data_by_daemon(daemon_type, daemon_id):
         # type: (str, str) -> dict
-        smart_data = CephService.send_command(daemon_type, 'smart', daemon_id)
-        if smart_data:
-            for _, dev_data in smart_data.items():
-                if 'error' in dev_data:
-                    logger.warning('[SMART] Error retrieving smartctl data for daemon "%s.%s"',
-                                   daemon_type, daemon_id)
-        return smart_data or {}
+        """
+        Get the SMART data of the devices associated with the given daemon.
+        :param daemon_type: The daemon type, e.g. 'osd' or 'mon'.
+        :param daemon_id: The daemon identifier.
+        :return: A dictionary containing the SMART data of every device
+          associated with the given daemon. The device name is used as the
+          key in the dictionary.
+        """
+        devices = CephService.get_devices_by_daemon(daemon_type, daemon_id)
+        smart_data = {}
+        if devices:
+            for device in devices:
+                if device['devid'] not in smart_data:
+                    smart_data.update(
+                        CephService._get_smart_data_by_device(device))
+        return smart_data
 
     @classmethod
     def get_rates(cls, svc_type, svc_name, path):