git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/telemetry: handle empty device report when "send" is triggered 44994/head
author: Laura Flores <lflores@redhat.com>
Fri, 11 Feb 2022 19:37:26 +0000 (19:37 +0000)
committer: Laura Flores <lflores@redhat.com>
Wed, 16 Feb 2022 19:38:53 +0000 (19:38 +0000)
In certain environments, such as the "ceph-dev-docker" environment
(https://github.com/ricardoasmarques/ceph-dev-docker), the mgr
module is unable to fetch device metrics. As a result, the device
report returned by "gather_device_report()" is an empty dict.
This causes an AssertionError when the "send" function is triggered
(i.e. by running `ceph telemetry status` or `ceph telemetry send`),
and the module crashes.

The fix in this commit checks that the generated device report
contains metrics before trying to send it. If the device report
does not contain metrics (i.e. it is an empty dict), the module
logs an error message in the mgr log and does not send the
device report.

If this scenario happens when running the `ceph telemetry send` command,
the user will additionally see this message:
```
Ceph report sent to https://telemetry.ceph.com/report
Unable to send device report: Device channel is on, but the generated report was empty.
```

I also added a few more debug messages in gather_device_report() to make
future debugging easier.

Fixes: https://tracker.ceph.com/issues/54250
Signed-off-by: Laura Flores <lflores@redhat.com>
src/pybind/mgr/telemetry/module.py

index 45baa1a5cc5ca718d9a6cef7fa2cde00ad6986ab..fd8cd11633d7731249749c34f5f80fcefc9d8b81 100644 (file)
@@ -756,12 +756,16 @@ class Module(MgrModule):
     def gather_device_report(self) -> Dict[str, Dict[str, Dict[str, str]]]:
         try:
             time_format = self.remote('devicehealth', 'get_time_format')
-        except Exception:
+        except Exception as e:
+            self.log.debug('Unable to format time: {}'.format(e))
             return {}
         cutoff = datetime.utcnow() - timedelta(hours=self.interval * 2)
         min_sample = cutoff.strftime(time_format)
 
         devices = self.get('devices')['devices']
+        if not devices:
+            self.log.debug('Unable to get device info from the mgr.')
+            return {}
 
         # anon-host-id -> anon-devid -> { timestamp -> record }
         res: Dict[str, Dict[str, Dict[str, str]]] = {}
@@ -771,13 +775,15 @@ class Module(MgrModule):
                 # this is a map of stamp -> {device info}
                 m = self.remote('devicehealth', 'get_recent_device_metrics',
                                 devid, min_sample)
-            except Exception:
+            except Exception as e:
+                self.log.debug('Unable to get recent metrics from device with id "{}": {}'.format(devid, e))
                 continue
 
             # anonymize host id
             try:
                 host = d['location'][0]['host']
-            except (KeyError, IndexError):
+            except (KeyError, IndexError) as e:
+                self.log.debug('Unable to get host from device with id "{}": {}'.format(devid, e))
                 continue
             anon_host = self.get_store('host-id/%s' % host)
             if not anon_host:
@@ -1268,23 +1274,27 @@ class Module(MgrModule):
             elif e == self.EndPoint.device:
                 if 'device' in self.get_active_channels():
                     devices = self.gather_device_report()
-                    assert devices
-                    num_devs = 0
-                    num_hosts = 0
-                    for host, ls in devices.items():
-                        self.log.debug('host %s devices %s' % (host, ls))
-                        if not len(ls):
-                            continue
-                        fail_reason = self._try_post('devices', self.device_url,
-                                                     ls)
-                        if fail_reason:
-                            failed.append(fail_reason)
-                        else:
-                            num_devs += len(ls)
-                            num_hosts += 1
-                    if num_devs:
-                        success.append('Reported %d devices across %d hosts' % (
-                            num_devs, len(devices)))
+                    if devices:
+                        num_devs = 0
+                        num_hosts = 0
+                        for host, ls in devices.items():
+                            self.log.debug('host %s devices %s' % (host, ls))
+                            if not len(ls):
+                                continue
+                            fail_reason = self._try_post('devices', self.device_url,
+                                                         ls)
+                            if fail_reason:
+                                failed.append(fail_reason)
+                            else:
+                                num_devs += len(ls)
+                                num_hosts += 1
+                        if num_devs:
+                            success.append('Reported %d devices from %d hosts across a total of %d hosts' % (
+                                num_devs, num_hosts, len(devices)))
+                    else:
+                        fail_reason = 'Unable to send device report: Device channel is on, but the generated report was empty.'
+                        failed.append(fail_reason)
+                        self.log.error(fail_reason)
         if failed:
             return 1, '', '\n'.join(success + failed)
         return 0, '', '\n'.join(success)