git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/prometheus: Clean up collection thread
author Boris Ranto <branto@redhat.com>
Wed, 25 Nov 2020 09:25:49 +0000 (10:25 +0100)
committer Boris Ranto <branto@redhat.com>
Mon, 30 Nov 2020 10:28:08 +0000 (11:28 +0100)
We need to clean up the metrics collection thread.

Signed-off-by: Boris Ranto <branto@redhat.com>
(cherry picked from commit 03fcaccafc877d10a894b1c39af5547f172c1ed3)

Conflicts:
prometheus/module.py: Pass _global_instance as an argument to
        MetricCollectionThread, collect can't be a static function
        anymore

src/pybind/mgr/prometheus/module.py

index 25b36c331c6ae19d27aabeded683cfe2bf509797..3f2bafd8138dfca62c294413693fd10264ef7403 100644 (file)
@@ -178,42 +178,46 @@ class Metric(object):
 
 
 class MetricCollectionThread(threading.Thread):
-    def __init__(self):
+    def __init__(self, module):
+        # type: (Module) -> None
+        self.mod = module
+        self.active = True
         super(MetricCollectionThread, self).__init__(target=self.collect)
 
-    @staticmethod
-    def collect():
-        inst = _global_instance
-        inst.log.info('starting metric collection thread')
-        while True:
-            if inst.have_mon_connection():
+    def collect(self):
+        self.mod.log.info('starting metric collection thread')
+        while self.active:
+            self.mod.log.debug('collecting cache in thread')
+            if self.mod.have_mon_connection():
                 start_time = time.time()
-                data = inst.collect()
+                data = self.mod.collect()
                 duration = time.time() - start_time
                 
-                sleep_time = inst.scrape_interval - duration
+                sleep_time = self.mod.scrape_interval - duration
                 if sleep_time < 0:
-                    inst.log.warning(
+                    self.mod.log.warning(
                         'Collecting data took more time than configured scrape interval. '
                         'This possibly results in stale data. Please check the '
                         '`stale_cache_strategy` configuration option. '
                         'Collecting data took {:.2f} seconds but scrape interval is configured '
                         'to be {:.0f} seconds.'.format(
                             duration,
-                            inst.scrape_interval,
+                            self.mod.scrape_interval,
                         )
                     )
                     sleep_time = 0
 
-                with inst.collect_lock:
-                    inst.collect_cache = data
-                    inst.collect_time = duration
+                with self.mod.collect_lock:
+                    self.mod.collect_cache = data
+                    self.mod.collect_time = duration
 
                 time.sleep(sleep_time)
             else:
-                inst.log.error('No MON connection')
-                time.sleep(inst.scrape_interval)
+                self.mod.log.error('No MON connection')
+                time.sleep(self.mod.scrape_interval)
 
+    def stop(self):
+        self.active = False
 
 class Module(MgrModule):
     COMMANDS = [
@@ -265,7 +269,7 @@ class Module(MgrModule):
         }
         global _global_instance
         _global_instance = self
-        MetricCollectionThread().start()
+        self.metrics_thread = MetricCollectionThread(_global_instance)
 
     def _setup_static_metrics(self):
         metrics = {}
@@ -1169,6 +1173,8 @@ class Module(MgrModule):
             (server_addr, server_port)
         )
 
+        self.metrics_thread.start()
+
         # Publish the URI that others may use to access the service we're
         # about to start serving
         self.set_uri('http://{0}:{1}/'.format(
@@ -1188,9 +1194,13 @@ class Module(MgrModule):
         # wait for the shutdown event
         self.shutdown_event.wait()
         self.shutdown_event.clear()
+        # tell metrics collection thread to stop collecting new metrics
+        self.metrics_thread.stop()
         cherrypy.engine.stop()
         self.log.info('Engine stopped.')
         self.shutdown_rbd_stats()
+        # wait for the metrics collection thread to stop
+        self.metrics_thread.join()
 
     def shutdown(self):
         self.log.info('Stopping engine...')