]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: fix: get SMART data from single-daemon device 44573/head
authorAlfonso Martínez <almartin@redhat.com>
Thu, 13 Jan 2022 14:20:48 +0000 (15:20 +0100)
committerAlfonso Martínez <almartin@redhat.com>
Thu, 13 Jan 2022 14:20:48 +0000 (15:20 +0100)
Return SMART data even when a device is only associated with a single daemon.

Fixes: https://tracker.ceph.com/issues/53858
Signed-off-by: Alfonso Martínez <almartin@redhat.com>
src/pybind/mgr/dashboard/services/ceph_service.py
src/pybind/mgr/dashboard/tests/test_ceph_service.py

index c1e0db2daa01f8a376bbc4aab6d3fe4d3c42822d..b261dc6b02b0c8d879939500e7933e63a12d7304 100644 (file)
@@ -8,7 +8,6 @@ from mgr_module import CommandResult
 from mgr_util import get_most_recent_rate, get_time_series_rates
 
 from .. import mgr
-from ..exceptions import DashboardException
 
 try:
     from typing import Any, Dict, Optional, Union
@@ -235,11 +234,7 @@ class CephService(object):
         # type: (dict) -> Dict[str, dict]
         # Check whether the device is associated with daemons.
         if 'daemons' in device and device['daemons']:
-            dev_smart_data = None
-
-            # The daemons associated with the device. Note, the list may
-            # contain daemons that are 'down' or 'destroyed'.
-            daemons = device.get('daemons')
+            dev_smart_data: Dict[str, Any] = {}
 
             # Get a list of all OSD daemons on all hosts that are 'up'
             # because SMART data can not be retrieved from daemons that
@@ -250,26 +245,27 @@ class CephService(object):
                 if node.get('status') == 'up'
             ]
 
-            # Finally get the daemons on the host of the given device
-            # that are 'up'. All daemons on the same host can deliver
-            # SMART data, thus it is not relevant for us which daemon
-            # we are using.
-            daemons = list(set(daemons) & set(osd_daemons_up))  # type: ignore
-
-            for daemon in daemons:
+            # All daemons on the same host can deliver SMART data,
+            # thus it is not relevant for us which daemon we are using.
+            # NOTE: the list may contain daemons that are 'down' or 'destroyed'.
+            for daemon in device['daemons']:
                 svc_type, svc_id = daemon.split('.')
                 if 'osd' in svc_type:
+                    if daemon not in osd_daemons_up:
+                        continue
                     try:
                         dev_smart_data = CephService.send_command(
                             svc_type, 'smart', svc_id, devid=device['devid'])
-                    except SendCommandError:
+                    except SendCommandError as error:
+                        logger.warning(str(error))
                         # Try to retrieve SMART data from another daemon.
                         continue
                 elif 'mon' in svc_type:
                     try:
                         dev_smart_data = CephService.send_command(
                             svc_type, 'device query-daemon-health-metrics', who=daemon)
-                    except SendCommandError:
+                    except SendCommandError as error:
+                        logger.warning(str(error))
                         # Try to retrieve SMART data from another daemon.
                         continue
                 else:
@@ -280,10 +276,7 @@ class CephService(object):
                             '[SMART] Error retrieving smartctl data for device ID "%s": %s',
                             dev_id, dev_data)
                 break
-            if dev_smart_data is None:
-                raise DashboardException(
-                    'Failed to retrieve SMART data for device ID "{}"'.format(
-                        device['devid']))
+
             return dev_smart_data
         logger.warning('[SMART] No daemons associated with device ID "%s"',
                        device['devid'])
index 2ad9576af80056c755be7ce187db46e5a1929340..7d178695c8d568ce158f89b9dfdfe7f43ece8e39 100644 (file)
@@ -109,23 +109,61 @@ def test_get_smart_data(caplog, by, args, log):
 
 
 @mock.patch.object(CephService, 'send_command')
-def test_get_smart_data_from_appropriate_ceph_command(send_command):
+def test_get_smart_data_by_device(send_command):
     # pylint: disable=protected-access
-    send_command.side_effect = [
-        {'nodes': [{'name': 'osd.1', 'status': 'up'}, {'name': 'mon.1', 'status': 'down'}]},
-        {'fake': {'device': {'name': '/dev/sda'}}}
-    ]
-    CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']})
+    device_id = 'Hitachi_HUA72201_JPW9K0N20D22SE'
+    osd_tree_payload = {'nodes':
+                        [
+                            {'name': 'osd.1', 'status': 'down'},
+                            {'name': 'osd.2', 'status': 'up'},
+                            {'name': 'osd.3', 'status': 'up'}
+                        ]}
+    health_metrics_payload = {device_id: {'ata_apm': {'enabled': False}}}
+    side_effect = [osd_tree_payload, health_metrics_payload]
+
+    # Daemons associated: 1 osd down, 2 osd up.
+    send_command.side_effect = side_effect
+    smart_data = CephService._get_smart_data_by_device(
+        {'devid': device_id, 'daemons': ['osd.1', 'osd.2', 'osd.3']})
+    assert smart_data == health_metrics_payload
+    send_command.assert_has_calls([mock.call('mon', 'osd tree'),
+                                   mock.call('osd', 'smart', '2', devid=device_id)])
+
+    # Daemons associated: 1 osd down.
+    send_command.reset_mock()
+    send_command.side_effect = [osd_tree_payload]
+    smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['osd.1']})
+    assert smart_data == {}
+    send_command.assert_has_calls([mock.call('mon', 'osd tree')])
+
+    # Daemons associated: 1 osd down, 1 mon.
+    send_command.reset_mock()
+    send_command.side_effect = side_effect
+    smart_data = CephService._get_smart_data_by_device(
+        {'devid': device_id, 'daemons': ['osd.1', 'mon.1']})
+    assert smart_data == health_metrics_payload
     send_command.assert_has_calls([mock.call('mon', 'osd tree'),
-                                   mock.call('osd', 'smart', '1', devid='1')])
+                                   mock.call('mon', 'device query-daemon-health-metrics',
+                                             who='mon.1')])
 
-    send_command.side_effect = [
-        {'nodes': [{'name': 'osd.1', 'status': 'down'}, {'name': 'mon.1', 'status': 'up'}]},
-        {'fake': {'device': {'name': '/dev/sda'}}}
-    ]
-    CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']})
+    # Daemons associated: 1 mon.
+    send_command.reset_mock()
+    send_command.side_effect = side_effect
+    smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['mon.1']})
+    assert smart_data == health_metrics_payload
     send_command.assert_has_calls([mock.call('mon', 'osd tree'),
-                                   mock.call('osd', 'smart', '1', devid='1'),
-                                   mock.call('mon', 'osd tree'),
                                    mock.call('mon', 'device query-daemon-health-metrics',
                                              who='mon.1')])
+
+    # Daemons associated: 1 other (non-osd, non-mon).
+    send_command.reset_mock()
+    send_command.side_effect = [osd_tree_payload]
+    smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['rgw.1']})
+    assert smart_data == {}
+    send_command.assert_has_calls([mock.call('mon', 'osd tree')])
+
+    # Daemons associated: no daemons.
+    send_command.reset_mock()
+    smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': []})
+    assert smart_data == {}
+    send_command.assert_has_calls([])