]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/telemetry: send device telemetry via per-host POST to device endpoint
authorSage Weil <sage@redhat.com>
Tue, 29 Oct 2019 14:28:18 +0000 (09:28 -0500)
committerSage Weil <sage@redhat.com>
Wed, 6 Nov 2019 12:41:50 +0000 (06:41 -0600)
We do not want to associate devices with clusters because that may
communicate unnecessary information about the association between vendors
and clusters (which, when large, are potentially identifying).

Instead, do a POST per host with all of the devices on that host only.

The devices endpoint does not log the POST time, so these per-host
records won't be associated with each other.

Signed-off-by: Sage Weil <sage@redhat.com>
(cherry picked from commit af64b18f1caa278dc0f73d4e203265f2625e35d4)

src/pybind/mgr/telemetry/module.py

index 37d7c7f1e60731ed6496e3f0ce095372440b680b..38409365a8d10b3e2fa42242f99d179eef3b3f9e 100644 (file)
@@ -69,6 +69,11 @@ class Module(MgrModule):
             'type': 'str',
             'default': 'https://telemetry.ceph.com/report'
         },
+        {
+            'name': 'device_url',
+            'type': 'str',
+            'default': 'https://telemetry.ceph.com/device'
+        },
         {
             'name': 'enabled',
             'type': 'bool',
@@ -143,7 +148,8 @@ class Module(MgrModule):
             "perm": "r"
         },
         {
-            "cmd": "telemetry send",
+            "cmd": "telemetry send "
+                   "name=endpoint,type=CephChoices,strings=ceph|device,n=N,req=false",
             "desc": "Force sending data to Ceph telemetry",
             "perm": "rw"
         },
@@ -308,10 +314,11 @@ class Module(MgrModule):
 
         devices = self.get('devices')['devices']
 
-        res = {}
+        res = {}  # anon-host-id -> anon-devid -> { timestamp -> record }
         for d in devices:
             devid = d['devid']
             try:
+                # this is a map of stamp -> {device info}
                 m = self.remote('devicehealth', 'get_recent_device_metrics',
                                 devid, min_sample)
             except:
@@ -326,7 +333,8 @@ class Module(MgrModule):
             if not anon_host:
                 anon_host = str(uuid.uuid1())
                 self.set_store('host-id/%s' % host, anon_host)
-            m['host_id'] = anon_host
+            for dev, rep in m.items():
+                rep['host_id'] = anon_host
 
             # anonymize device id
             (vendor, model, serial) = devid.split('_')
@@ -334,7 +342,6 @@ class Module(MgrModule):
             if not anon_devid:
                 anon_devid = '%s_%s_%s' % (vendor, model, uuid.uuid1())
                 self.set_store('devid-id/%s' % devid, anon_devid)
-
             self.log.info('devid %s / %s, host %s / %s' % (devid, anon_devid,
                                                            host, anon_host))
 
@@ -342,7 +349,9 @@ class Module(MgrModule):
             for k in ['serial_number']:
                 m.pop(k)
 
-            res[anon_devid] = m
+            if anon_host not in res:
+                res[anon_host] = {}
+            res[anon_host][anon_devid] = m
         return res
 
     def compile_report(self, channels=[]):
@@ -425,24 +434,75 @@ class Module(MgrModule):
         if 'crash' in channels:
             report['crashes'] = self.gather_crashinfo()
 
-        if 'device' in channels:
-            report['devices'] = self.gather_device_report()
+        # NOTE: We do not include the 'device' channel in this report; it is
+        # sent to a different endpoint.
 
         return report
 
-    def send(self, report):
-        self.log.info('Upload report to: %s', self.url)
+    def send(self, report, endpoint=None):
+        if not endpoint:
+            endpoint = ['ceph', 'device']
+        failed = []
+        success = []
         proxies = dict()
+        self.log.debug('Send endpoints %s' % endpoint)
         if self.proxy:
-            self.log.info('Using HTTP(S) proxy: %s', self.proxy)
+            self.log.info('Send using HTTP(S) proxy: %s', self.proxy)
             proxies['http'] = self.proxy
             proxies['https'] = self.proxy
-
-        resp = requests.put(url=self.url, json=report, proxies=proxies)
-        if not resp.ok:
-            self.log.error("Report send failed: %d %s %s" %
-                           (resp.status_code, resp.reason, resp.text))
-        return resp
+        for e in endpoint:
+            if e == 'ceph':
+                self.log.info('Sending ceph report to: %s', self.url)
+                resp = requests.put(url=self.url, json=report, proxies=proxies)
+                if not resp.ok:
+                    self.log.error("Report send failed: %d %s %s" %
+                                   (resp.status_code, resp.reason, resp.text))
+                    failed.append('Failed to send report to %s: %d %s %s' % (
+                        self.url,
+                        resp.status_code,
+                        resp.reason,
+                        resp.text
+                    ))
+                else:
+                    now = int(time.time())
+                    self.last_upload = now
+                    self.set_store('last_upload', str(now))
+                    success.append('Ceph report sent to {0}'.format(self.url))
+                    self.info('Sent report to {0}'.format(self.url))
+            elif e == 'device':
+                if 'device' in self.get_active_channels():
+                    self.log.info('hi')
+                    self.log.info('Sending device report to: %s',
+                                  self.device_url)
+                    devices = self.gather_device_report()
+                    num_devs = 0
+                    num_hosts = 0
+                    for host, ls in devices.items():
+                        self.log.debug('host %s devices %s' % (host, ls))
+                        if not len(ls):
+                            continue
+                        resp = requests.put(url=self.device_url, json=ls,
+                                            proxies=proxies)
+                        if not resp.ok:
+                            self.log.error(
+                                "Device report failed: %d %s %s" %
+                                (resp.status_code, resp.reason, resp.text))
+                            failed.append(
+                                'Failed to send devices to %s: %d %s %s' % (
+                                    self.device_url,
+                                    resp.status_code,
+                                    resp.reason,
+                                    resp.text
+                                ))
+                        else:
+                            num_devs += len(ls)
+                            num_hosts += 1
+                    if num_devs:
+                        success.append('Reported %d devices across %d hosts' % (
+                            num_devs, len(devices)))
+        if failed:
+            return 1, '', '\n'.join(success + failed)
+        return 0, '', '\n'.join(success)
 
     def handle_command(self, inbuf, command):
         if command['prefix'] == 'telemetry status':
@@ -462,15 +522,7 @@ class Module(MgrModule):
             return 0, '', ''
         elif command['prefix'] == 'telemetry send':
             self.last_report = self.compile_report()
-            resp = self.send(self.last_report)
-            if resp.ok:
-                return 0, 'Report sent to {0}'.format(self.url), ''
-            return 1, '', 'Failed to send report to %s: %d %s %s' % (
-                self.url,
-                resp.status_code,
-                resp.reason,
-                resp.text
-            )
+            return self.send(self.last_report, command.get('endpoint'))
 
         elif command['prefix'] == 'telemetry show':
             report = self.compile_report(
@@ -538,15 +590,7 @@ class Module(MgrModule):
                 except:
                     self.log.exception('Exception while compiling report:')
 
-                try:
-                    resp = self.send(self.last_report)
-                    # self.send logs on failure; only update last_upload
-                    # if we succeed
-                    if resp.ok:
-                        self.last_upload = now
-                        self.set_store('last_upload', str(now))
-                except:
-                    self.log.exception('Exception while sending report:')
+                self.send(self.last_report)
             else:
                 self.log.debug('Interval for sending new report has not expired')