From: Laura Flores Date: Fri, 11 Feb 2022 19:37:26 +0000 (+0000) Subject: mgr/telemetry: handle empty device report when "send" is triggered X-Git-Tag: v17.1.0~5^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=2e775dc3385e806155ca5e61b555780e10cde6c9;p=ceph.git mgr/telemetry: handle empty device report when "send" is triggered In certain environments, such as the "ceph-dev-docker" environment (https://github.com/ricardoasmarques/ceph-dev-docker), the mgr module is unable to fetch device metrics. As a result, "gather_device_report()" returns an empty dict as the device report. This causes an AssertionError when the "send" function is triggered (i.e. by running `ceph telemetry status` or `ceph telemetry send`), and the module crashes. The fix in this commit checks that the generated device report contains metrics before trying to send it. If the device report does not contain metrics (i.e. it is an empty dict), the module will log an appropriate message in the mgr log and not send the device report. If this scenario happens when running the `ceph telemetry send` command, the user will additionally see this message: ``` Ceph report sent to https://telemetry.ceph.com/report Unable to send device report: Device channel is on, but the generated report was empty. ``` I also added a few more debug messages in gather_device_report() to make future debugging easier. 
Fixes: https://tracker.ceph.com/issues/54250 Signed-off-by: Laura Flores (cherry picked from commit 54e0e58f1b3f431281df0e2dd2b258f85cbade19) --- diff --git a/src/pybind/mgr/telemetry/module.py b/src/pybind/mgr/telemetry/module.py index 9245431d1ea50..f7cb29df3a275 100644 --- a/src/pybind/mgr/telemetry/module.py +++ b/src/pybind/mgr/telemetry/module.py @@ -771,12 +771,16 @@ class Module(MgrModule): def gather_device_report(self) -> Dict[str, Dict[str, Dict[str, str]]]: try: time_format = self.remote('devicehealth', 'get_time_format') - except Exception: + except Exception as e: + self.log.debug('Unable to format time: {}'.format(e)) return {} cutoff = datetime.utcnow() - timedelta(hours=self.interval * 2) min_sample = cutoff.strftime(time_format) devices = self.get('devices')['devices'] + if not devices: + self.log.debug('Unable to get device info from the mgr.') + return {} # anon-host-id -> anon-devid -> { timestamp -> record } res: Dict[str, Dict[str, Dict[str, str]]] = {} @@ -786,13 +790,15 @@ class Module(MgrModule): # this is a map of stamp -> {device info} m = self.remote('devicehealth', 'get_recent_device_metrics', devid, min_sample) - except Exception: + except Exception as e: + self.log.debug('Unable to get recent metrics from device with id "{}": {}'.format(devid, e)) continue # anonymize host id try: host = d['location'][0]['host'] - except (KeyError, IndexError): + except (KeyError, IndexError) as e: + self.log.debug('Unable to get host from device with id "{}": {}'.format(devid, e)) continue anon_host = self.get_store('host-id/%s' % host) if not anon_host: @@ -1321,23 +1327,27 @@ class Module(MgrModule): elif e == self.EndPoint.device: if 'device' in self.get_active_channels(): devices = self.gather_device_report() - assert devices - num_devs = 0 - num_hosts = 0 - for host, ls in devices.items(): - self.log.debug('host %s devices %s' % (host, ls)) - if not len(ls): - continue - fail_reason = self._try_post('devices', self.device_url, - ls) - if 
fail_reason: - failed.append(fail_reason) - else: - num_devs += len(ls) - num_hosts += 1 - if num_devs: - success.append('Reported %d devices across %d hosts' % ( - num_devs, len(devices))) + if devices: + num_devs = 0 + num_hosts = 0 + for host, ls in devices.items(): + self.log.debug('host %s devices %s' % (host, ls)) + if not len(ls): + continue + fail_reason = self._try_post('devices', self.device_url, + ls) + if fail_reason: + failed.append(fail_reason) + else: + num_devs += len(ls) + num_hosts += 1 + if num_devs: + success.append('Reported %d devices from %d hosts across a total of %d hosts' % ( + num_devs, num_hosts, len(devices))) + else: + fail_reason = 'Unable to send device report: Device channel is on, but the generated report was empty.' + failed.append(fail_reason) + self.log.error(fail_reason) if failed: return 1, '', '\n'.join(success + failed) return 0, '', '\n'.join(success)