From: Patrick Seidensal
Date: Tue, 9 Jun 2020 10:35:00 +0000 (+0200)
Subject: mgr/prometheus: improve Prometheus module cache
X-Git-Tag: v14.2.11~53^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=58078d7a49da5a9b2b7069fa47edcc4c18ac320d;p=ceph.git

mgr/prometheus: improve Prometheus module cache

Improve the cache by running requests in a thread and prevent multiple
requests to Ceph from multiple sources (e.g. Prometheus instances),
which increase load on the manager.

Fixes: https://tracker.ceph.com/issues/45554

Signed-off-by: Patrick Seidensal
(cherry picked from commit fa69d8e1112d2688e2979ab10c303d1facb6bc76)

Conflicts:
	src/pybind/mgr/prometheus/module.py
	  - line 1096 if condition block changed
	  - second commit "mgr/prometheus: enable mypy type checking for
	    prometheus module" is not required for the backport, as discussed
	    with Patrick, who is the creator of the original PR.
---

diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst
index 8dbc44f5292ac..87296be391471 100644
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -25,11 +25,65 @@ The *prometheus* module is enabled with::
 
 Configuration
 -------------
 
-By default the module will accept HTTP requests on port ``9283`` on all
-IPv4 and IPv6 addresses on the host. The port and listen address are both
+.. note::
+
+   The Prometheus manager module needs to be restarted for configuration changes to be applied.
+
+By default the module will accept HTTP requests on port ``9283`` on all IPv4
+and IPv6 addresses on the host. The port and listen address are both
 configurable with ``ceph config-key set``, with keys
-``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``.
-This port is registered with Prometheus's `registry
-<https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
+``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``. This port
+is registered with Prometheus's `registry
+<https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
+
+::
+
+    ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
+    ceph config set mgr mgr/prometheus/server_port 9283
+
+.. warning::
+
+    The ``scrape_interval`` of this module should always be set to match the
+    scrape interval configured in Prometheus; otherwise the module may not
+    work properly.
+
+The Prometheus manager module is, by default, configured with a scrape
+interval of 15 seconds. The scrape interval in the module is used for caching
+purposes and to determine when the cache is stale.
+
+A scrape interval below 10 seconds is not recommended. The recommended scrape
+interval is 15 seconds, though in some cases it might be useful to increase
+it.
+
+To set a different scrape interval in the Prometheus module, set
+``scrape_interval`` to the desired value::
+
+    ceph config set mgr mgr/prometheus/scrape_interval 20
+
+On large clusters (>1000 OSDs), the time to fetch the metrics may become
+significant. Without the cache, the Prometheus manager module could,
+especially in conjunction with multiple Prometheus instances, overload the
+manager and lead to unresponsive or crashing Ceph manager instances. Hence,
+the cache is enabled by default and cannot be disabled. This means that there
+is a possibility that the cache becomes stale. The cache is considered stale
+when the time to fetch the metrics from Ceph exceeds the configured
+``scrape_interval``.
+
+If that is the case, **a warning will be logged** and the module will either
+
+* respond with a 503 HTTP status code (service unavailable), or
+* return the content of the cache, even though it might be stale.
+
+This behavior can be configured.
+By default, the module will respond with a 503 HTTP status code (service
+unavailable). The behavior can be changed with the ``ceph config set``
+command.
+
+To tell the module to respond with possibly stale data, set it to ``return``::
+
+    ceph config set mgr mgr/prometheus/stale_cache_strategy return
+
+To tell the module to respond with "service unavailable", set it to ``fail``::
+
+    ceph config set mgr mgr/prometheus/stale_cache_strategy fail
 
 .. _prometheus-rbd-io-statistics:
 
@@ -62,7 +116,7 @@ Statistic names and labels
 ==========================
 
 The names of the stats are exactly as Ceph names them, with
-illegal characters ``.``, ``-`` and ``::`` translated to ``_``, 
+illegal characters ``.``, ``-`` and ``::`` translated to ``_``,
 and ``ceph_`` prefixed to all names.
 
 
@@ -75,7 +129,7 @@ rocksdb stats.
 
 The *cluster* statistics (i.e. those global to the Ceph cluster)
-have labels appropriate to what they report on. For example, 
+have labels appropriate to what they report on. For example,
 metrics relating to pools have a ``pool_id`` label.
 
 
@@ -109,7 +163,7 @@ Correlating drive statistics with node_exporter
 
 The prometheus output from Ceph is designed to be used in conjunction
 with the generic host monitoring from the Prometheus node_exporter.
-To enable correlation of Ceph OSD statistics with node_exporter's 
+To enable correlation of Ceph OSD statistics with node_exporter's
 drive statistics, special series are output like this:
 
 ::
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 3a11a362c9abe..54c1c265a6f65 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -12,6 +12,11 @@ from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
 from mgr_util import get_default_addr
 from rbd import RBD
 
+try:
+    from typing import Optional
+except:
+    pass
+
 # Defaults for the Prometheus HTTP server. Can also set in config-key
 # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
 # for Prometheus exporter port registry
@@ -42,12 +47,7 @@ os._exit = os_exit_noop
 # to access things in class Module from subclass Root. Because
 # it's a dict, the writer doesn't need to declare 'global' for access
 
-_global_instance = {'plugin': None}
-
-
-def global_instance():
-    assert _global_instance['plugin'] is not None
-    return _global_instance['plugin']
+_global_instance = None  # type: Optional[Module]
 
 
 def health_status_to_number(status):
@@ -176,6 +176,44 @@ class Metric(object):
         return expfmt
 
 
+class MetricCollectionThread(threading.Thread):
+    def __init__(self):
+        super(MetricCollectionThread, self).__init__(target=self.collect)
+
+    @staticmethod
+    def collect():
+        inst = _global_instance
+        inst.log.info('starting metric collection thread')
+        while True:
+            if inst.have_mon_connection():
+                start_time = time.time()
+                data = inst.collect()
+                duration = time.time() - start_time
+
+                sleep_time = inst.scrape_interval - duration
+                if sleep_time < 0:
+                    inst.log.warning(
+                        'Collecting data took more time than configured scrape interval. '
+                        'This possibly results in stale data. Please check the '
+                        '`stale_cache_strategy` configuration option. '
+                        'Collecting data took {:.2f} seconds but scrape interval is configured '
+                        'to be {:.0f} seconds.'.format(
+                            duration,
+                            inst.scrape_interval,
+                        )
+                    )
+                    sleep_time = 0
+
+                with inst.collect_lock:
+                    inst.collect_cache = data
+                    inst.collect_time = duration
+
+                time.sleep(sleep_time)
+            else:
+                inst.log.error('No MON connection')
+                time.sleep(inst.scrape_interval)
+
+
 class Module(MgrModule):
     COMMANDS = [
         {
@@ -189,17 +227,22 @@ class Module(MgrModule):
         {'name': 'server_addr'},
         {'name': 'server_port'},
         {'name': 'scrape_interval'},
+        {'name': 'stale_cache_strategy'},
         {'name': 'rbd_stats_pools'},
         {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300},
     ]
 
+    STALE_CACHE_FAIL = 'fail'
+    STALE_CACHE_RETURN = 'return'
+
     def __init__(self, *args, **kwargs):
         super(Module, self).__init__(*args, **kwargs)
         self.metrics = self._setup_static_metrics()
         self.shutdown_event = threading.Event()
-        self.collect_lock = threading.RLock()
+        self.collect_lock = threading.Lock()
         self.collect_time = 0
-        self.collect_timeout = 5.0
+        self.scrape_interval = 15.0
+        self.stale_cache_strategy = self.STALE_CACHE_FAIL
         self.collect_cache = None
         self.rbd_stats = {
             'pools': {},
@@ -219,7 +262,9 @@ class Module(MgrModule):
                     'desc': 'RBD image reads latency (msec)'},
             },
         }
-        _global_instance['plugin'] = self
+        global _global_instance
+        _global_instance = self
+        MetricCollectionThread().start()
 
     def _setup_static_metrics(self):
         metrics = {}
@@ -986,7 +1031,7 @@ class Module(MgrModule):
         # TODO use get_config_prefix or get_config here once
         # https://github.com/ceph/ceph/pull/20458 is merged
         result = CommandResult("")
-        global_instance().send_command(
+        _global_instance.send_command(
             result, "mon", '',
             json.dumps({
                 "prefix": "config-key get",
@@ -995,7 +1040,7 @@ class Module(MgrModule):
             "")
         r, outb, outs = result.wait()
         if r != 0:
-            global_instance().log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
+            _global_instance.log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
             targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
         else:
             port = json.loads(outb)
@@ -1042,33 +1087,54 @@ class Module(MgrModule):
 
             @cherrypy.expose
             def metrics(self):
-                instance = global_instance()
                 # Lock the function execution
-                try:
-                    instance.collect_lock.acquire()
-                    return self._metrics(instance)
-                finally:
-                    instance.collect_lock.release()
+                with _global_instance.collect_lock:
+                    return self._metrics(_global_instance)
 
             @staticmethod
             def _metrics(instance):
-                # Return cached data if available and collected before the cache times out
-                if instance.collect_cache and time.time() - instance.collect_time < instance.collect_timeout:
-                    cherrypy.response.headers['Content-Type'] = 'text/plain'
-                    return instance.collect_cache
+                # Return cached data if available
+                if not instance.collect_cache:
+                    raise cherrypy.HTTPError(503, 'No cached data available yet')
 
-                if instance.have_mon_connection():
-                    instance.collect_cache = None
-                    instance.collect_time = time.time()
-                    instance.collect_cache = instance.collect()
+                def respond():
                     cherrypy.response.headers['Content-Type'] = 'text/plain'
                     return instance.collect_cache
-                else:
-                    raise cherrypy.HTTPError(503, 'No MON connection')
+
+                if instance.collect_time < instance.scrape_interval:
+                    # Respond if cache isn't stale
+                    return respond()
+
+                if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
+                    # Respond even if cache is stale
+                    instance.log.info(
+                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
+                        'returning metrics from stale cache.'.format(
+                            instance.collect_time,
+                            instance.collect_time - instance.scrape_interval
+                        )
+                    )
+                    return respond()
+
+                if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
+                    # Fail if cache is stale
+                    msg = (
+                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
+                        'returning "service unavailable".'.format(
+                            instance.collect_time,
+                            instance.collect_time - instance.scrape_interval,
+                        )
+                    )
+                    instance.log.error(msg)
+                    raise cherrypy.HTTPError(503, msg)
 
         # Make the cache timeout for collecting configurable
-        self.collect_timeout = float(self.get_localized_module_option(
-            'scrape_interval', 5.0))
+        self.scrape_interval = float(self.get_localized_module_option('scrape_interval', 15.0))
+
+        self.stale_cache_strategy = self.get_localized_module_option('stale_cache_strategy', 'log')
+        if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
+                                             self.STALE_CACHE_RETURN]:
+            self.stale_cache_strategy = self.STALE_CACHE_FAIL
 
         server_addr = self.get_localized_module_option(
             'server_addr', get_default_addr())
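
The patch boils down to a simple pattern: a background thread refreshes a
shared cache roughly once per scrape interval, and the HTTP handler only ever
serves from that cache, applying a stale-cache strategy when collection took
longer than the interval. The following self-contained sketch (not part of the
patch) illustrates that pattern outside the mgr framework; it does not use the
mgr or CherryPy APIs, and the names ``collect_metrics``, ``Cache``,
``collection_loop``, ``handle_scrape`` and ``SCRAPE_INTERVAL`` are hypothetical
stand-ins invented for this example::

    import threading
    import time

    # Hypothetical values standing in for the module options shown above;
    # they are not part of the Ceph mgr API.
    SCRAPE_INTERVAL = 15.0        # mirrors mgr/prometheus/scrape_interval
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'


    def collect_metrics():
        """Placeholder for the expensive collection of metrics from Ceph."""
        time.sleep(1.0)  # simulate work
        return 'ceph_example_metric 1\n'


    class Cache(object):
        """Shared state between the collection thread and the scrape handler."""
        def __init__(self):
            self.lock = threading.Lock()
            self.data = None          # last successfully collected payload
            self.collect_time = 0.0   # how long that collection took (seconds)


    def collection_loop(cache):
        """Refresh the cache roughly once per scrape interval."""
        while True:
            start = time.time()
            data = collect_metrics()
            duration = time.time() - start

            with cache.lock:
                cache.data = data
                cache.collect_time = duration

            # If collecting took longer than the interval, start over right
            # away; the handler below then treats the cache as stale.
            time.sleep(max(SCRAPE_INTERVAL - duration, 0))


    def handle_scrape(cache, stale_cache_strategy=STALE_CACHE_FAIL):
        """Serve one scrape from the cache and apply the stale-cache strategy."""
        with cache.lock:
            if cache.data is None:
                return 503, 'No cached data available yet'
            if cache.collect_time < SCRAPE_INTERVAL:
                return 200, cache.data          # cache is considered fresh
            if stale_cache_strategy == STALE_CACHE_RETURN:
                return 200, cache.data          # stale, but returned anyway
            return 503, 'metrics are stale'     # default: fail


    if __name__ == '__main__':
        cache = Cache()
        threading.Thread(target=collection_loop, args=(cache,), daemon=True).start()
        time.sleep(2)  # give the collector time to populate the cache
        print(handle_scrape(cache))

The real module wires the same control flow into CherryPy and reads
``scrape_interval`` and ``stale_cache_strategy`` through
``get_localized_module_option``, as shown in the hunks above.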