git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
node-proxy: catch RequestException in reporter
author: Guillaume Abrioux <gabrioux@ibm.com>
Wed, 7 Jun 2023 12:23:57 +0000 (14:23 +0200)
committer: Guillaume Abrioux <gabrioux@ibm.com>
Thu, 25 Jan 2024 14:43:30 +0000 (14:43 +0000)
This catches requests.exceptions.RequestException in the
reporter agent so we can better handle the case where the
reporter can't reach the endpoint when trying to send the
collected data.
Before this change, if for some reason the refreshed data couldn't be
sent to the endpoint, the send was never retried: `self.system.previous_data`
was overwritten anyway, so the next iteration saw no difference and
skipped sending.

Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
src/cephadm/node-proxy/reporter.py

index a454b8a5bb7400c0496ff50775fbb0f5fd943c5b..c20195b535d4d2766dbf980836dfc97b82d1c51a 100644 (file)
@@ -28,17 +28,21 @@ class Reporter:
             # dense clusters
             if self.system.data_ready:
                 log.debug("waiting for a lock.")
-                try:
-                    self.system.lock.acquire()
-                    log.debug("lock acquired.")
-                    if not self.system.get_system() == self.system.previous_data:
-                        self.system.previous_data = self.system.get_system()
-                        log.info('data has changed since last iteration.')
-                        d = self.system.get_system()
+                self.system.lock.acquire()
+                log.debug("lock acquired.")
+                if not self.system.get_system() == self.system.previous_data:
+                    log.info('data has changed since last iteration.')
+                    d = self.system.get_system()
+                    try:
                         requests.post(f"{self.observer_url}/fake_endpoint", json=d)
+                    except requests.exceptions.RequestException as e:
+                        log.error(f"The reporter couldn't send data to the mgr: {e}")
+                        # Need to add a new parameter 'max_retries' to the reporter if it can't
+                        # send the data for more than x times, maybe the daemon should stop altogether
                     else:
-                        log.info('no diff, not sending data to the mgr.')
-                finally:
-                    self.system.lock.release()
-                    log.debug("lock released.")
-            time.sleep(20)
+                        self.system.previous_data = self.system.get_system()
+                else:
+                    log.info('no diff, not sending data to the mgr.')
+                self.system.lock.release()
+                log.debug("lock released.")
+            time.sleep(5)