git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/node-proxy: handle 'None' statuses returned by RedFish 55955/head
author: Guillaume Abrioux <gabrioux@ibm.com>
Tue, 5 Mar 2024 10:05:18 +0000 (10:05 +0000)
committer: Guillaume Abrioux <gabrioux@redhat.com>
Wed, 6 Mar 2024 09:09:34 +0000 (09:09 +0000)
It looks like RedFish might return null values ('None' once parsed in Python) for some attributes.

for instance:

```
[root@ceph-node-01 ~]# curl -s -k -X GET https://169.254.1.1/redfish/v1/Systems/System.Embedded.1/Storage/AHCI.SL.6-1/Drives/Disk.Direct.0-0:AHCI.SL.6-1 -H "X-Auth-Token: 3264251c28191fa5e7c9ebec49ef90fc"  | jq .Status
{
  "Health": "OK",
  "HealthRollup": "OK",
  "State": "Enabled"
}
[root@ceph-node-01 ~]# curl -s -k -X GET https://169.254.1.1/redfish/v1/Systems/System.Embedded.1/Storage/NonRAID.Slot.2-1/Drives/Disk.Bay.0:Enclosure.Internal.0-1:NonRAID.Slot.2-1 -H "X-Auth-Token: 3264251c28191fa5e7c9ebec49ef90fc" | jq .Status
{
  "Health": null,
  "HealthRollup": null,
  "State": "Enabled"
}
[root@ceph-node-01 ~]#
```

Although this seems to be a bug in RedFish, we need to handle
this case when it happens; otherwise, it makes the mgr orchestrator
module throw an error.

The idea here is to create a new status "unknown" when we can't fetch the
real status of a component.

Fixes: https://tracker.ceph.com/issues/64712
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
src/ceph-node-proxy/ceph_node_proxy/util.py
src/pybind/mgr/cephadm/inventory.py

index f6ed0fb483d6d113baefb517bbee5c8f45340ab8..677161c63fd310769e7291557a7fd1e0e0f6c50c 100644 (file)
@@ -126,6 +126,8 @@ def normalize_dict(test_dict: Dict) -> Dict:
         if isinstance(test_dict[key], dict):
             res[key.lower()] = normalize_dict(test_dict[key])
         else:
+            if test_dict[key] is None:
+                test_dict[key] = 'unknown'
             res[key.lower()] = test_dict[key]
     return res
 
index 235737ef10e763b4f2f572b4f164e2efd92d4c42..966ffc0461c850ae94e01434bcab05502aa75b04 100644 (file)
@@ -8,7 +8,7 @@ import logging
 import math
 import socket
 from typing import TYPE_CHECKING, Dict, List, Iterator, Optional, Any, Tuple, Set, Mapping, cast, \
-    NamedTuple, Type
+    NamedTuple, Type, ValuesView
 
 import orchestrator
 from ceph.deployment import inventory
@@ -1485,10 +1485,12 @@ class NodeProxyCache:
         """
         hostname = kw.get('hostname')
         hosts = [hostname] if hostname else self.data.keys()
-        mapper: Dict[bool, str] = {
-            True: 'error',
-            False: 'ok'
-        }
+
+        def is_unknown(statuses: ValuesView) -> bool:
+            return any([status['status']['health'].lower() == 'unknown' for status in statuses]) and not is_error(statuses)
+
+        def is_error(statuses: ValuesView) -> bool:
+            return any([status['status']['health'].lower() == 'error' for status in statuses])
 
         _result: Dict[str, Any] = {}
 
@@ -1496,9 +1498,15 @@ class NodeProxyCache:
             _result[host] = {}
             _result[host]['status'] = {}
             data = self.data[host]
-            for component, details in data['status'].items():
-                res = any([member['status']['health'].lower() != 'ok' for member in data['status'][component].values()])
-                _result[host]['status'][component] = mapper[res]
+            for component in data['status'].keys():
+                values = data['status'][component].values()
+                if is_error(values):
+                    state = 'error'
+                elif is_unknown(values):
+                    state = 'unknown'
+                else:
+                    state = 'ok'
+                _result[host]['status'][component] = state
             _result[host]['sn'] = data['sn']
             _result[host]['host'] = data['host']
             _result[host]['firmwares'] = data['firmwares']