import time
import enum
from collections import namedtuple
+from collections import OrderedDict
from tempfile import NamedTemporaryFile
from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
DEFAULT_PORT = 9283
-
-# cherrypy likes to sys.exit on error. don't let it take us down too!
-def os_exit_noop(status: int) -> None:
- pass
-
-
-os._exit = os_exit_noop # type: ignore
-
# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access
"""Return the instance as a dictionary."""
return self.__dict__
+class LRUCacheDict(OrderedDict):
+ def __init__(self, maxsize, *args, **kwargs):
+ self.maxsize = maxsize
+ super().__init__(*args, **kwargs)
+
+ def __setitem__(self, key, value):
+ if key in self:
+ self.move_to_end(key)
+ elif len(self) >= self.maxsize:
+ self.popitem(last=False) # drop oldest
+ super().__setitem__(key, value)
class HealthHistory:
kv_name = 'health_history'
def __init__(self, mgr: MgrModule):
self.mgr = mgr
self.lock = threading.Lock()
- self.healthcheck: Dict[str, HealthCheckEvent] = {}
+ self.max_entries = cast(int, self.mgr.get_localized_module_option('healthcheck_history_max_entries', 1000))
+ self.healthcheck: LRUCacheDict[str, HealthCheckEvent] = LRUCacheDict(maxsize=self.max_entries)
self._load()
def _load(self) -> None:
"""Reset the healthcheck history."""
with self.lock:
self.mgr.set_store(self.kv_name, "{}")
- self.healthcheck = {}
+ self.healthcheck.clear()
def save(self) -> None:
"""Save the current in-memory healthcheck history to the KV store."""
"""
current_checks = health_checks.get('checks', {})
- changes_made = False
-
- # first turn off any active states we're tracking
- for seen_check in self.healthcheck:
- check = self.healthcheck[seen_check]
- if check.active and seen_check not in current_checks:
- check.active = False
- changes_made = True
-
- # now look for any additions to track
now = time.time()
- for name, info in current_checks.items():
- if name not in self.healthcheck:
- # this healthcheck is new, so start tracking it
- changes_made = True
- self.healthcheck[name] = HealthCheckEvent(
- name=name,
- severity=info.get('severity'),
- first_seen=now,
- last_seen=now,
- count=1,
- active=True
- )
- else:
- # seen it before, so update its metadata
- check = self.healthcheck[name]
- if check.active:
- # check has been registered as active already, so skip
+ with self.lock:
+ changes_made = False
+ names = set(self.healthcheck) | set(current_checks)
+
+ for name in names:
+ present = name in current_checks
+ check = self.healthcheck.get(name)
+ if check is None:
+ if present:
+ info = current_checks[name]
+ self.healthcheck[name] = HealthCheckEvent(
+ name=name,
+ severity=info.get('severity'),
+ first_seen=now,
+ last_seen=now,
+ count=1,
+ active=True
+ )
+ changes_made = True
+
continue
- else:
+
+ if present:
+ if not check.active:
+ check.count += 1
+ changes_made = True
+
check.last_seen = now
- check.count += 1
check.active = True
- changes_made = True
+ else:
+ if check.active:
+ check.active = False
+ changes_made = True
- if changes_made:
- self.save()
+ if changes_made:
+ self.save()
def __str__(self) -> str:
"""Print the healthcheck history.
str: Human readable representation of the healthcheck history
"""
out = []
-
if len(self.healthcheck.keys()) == 0:
out.append("No healthchecks have been recorded")
else:
desc='Do not include perf-counters in the metrics output',
long_desc='Gathering perf-counters from a single Prometheus exporter can degrade ceph-mgr performance, especially in large clusters. Instead, Ceph-exporter daemons are now used by default for perf-counter gathering. This should only be disabled when no ceph-exporters are deployed.',
runtime=True
- )
+ ),
+ Option(
+ name='healthcheck_history_max_entries',
+ type='int',
+ default=1000,
+ desc='Maximum number of health check history entries to keep',
+ runtime=True
+ ),
]
STALE_CACHE_FAIL = 'fail'
self.setup_tls_config(server_addr, server_port)
return
except Exception as e:
- self.log.exception(f'Failed to setup cephadm based secure monitoring stack: {e}\n',
- 'Falling back to default configuration')
+ self.log.exception(
+ f'Failed to setup cephadm based secure monitoring stack: {e}\n'
+ 'Falling back to default configuration'
+ )
# In any error fallback to plain http mode
self.setup_default_config(server_addr, server_port)
'server.ssl_module': None,
'server.ssl_certificate': None,
'server.ssl_private_key': None,
+ 'tools.gzip.on': True,
+ 'tools.gzip.mime_types': [
+ 'text/plain',
+ 'text/html',
+ 'application/json',
+ ],
+ 'tools.gzip.compress_level': 6,
})
# Publish the URI that others may use to access the service we're about to start serving
self.set_uri(build_url(scheme='http', host=self.get_server_addr(),
'server.ssl_module': 'builtin',
'server.ssl_certificate': cert_file_path,
'server.ssl_private_key': key_file_path,
+ 'tools.gzip.on': True,
+ 'tools.gzip.mime_types': [
+ 'text/plain',
+ 'text/html',
+ 'application/json',
+ ],
+ 'tools.gzip.compress_level': 6,
})
# Publish the URI that others may use to access the service we're about to start serving
self.set_uri(build_url(scheme='https', host=self.get_server_addr(),
@staticmethod
def _metrics(instance: 'Module') -> Optional[str]:
- if not self.cache:
- self.log.debug('Cache disabled, collecting and returning without cache')
- cherrypy.response.headers['Content-Type'] = 'text/plain'
- return self.collect()
+ if not instance.cache:
+ instance.log.debug('Cache disabled, collecting and returning without cache')
+ return instance.collect()
# Return cached data if available
if not instance.collect_cache:
def respond() -> Optional[str]:
assert isinstance(instance, Module)
- cherrypy.response.headers['Content-Type'] = 'text/plain'
return instance.collect_cache
if instance.collect_time < instance.scrape_interval:
cherrypy.tree.mount(Root(), "/")
self.log.info('Starting engine...')
- cherrypy.engine.start()
+ try:
+ cherrypy.engine.start()
+ except Exception as e:
+ self.log.error(f'Failed to start engine: {e}')
+ return
self.log.info('Engine started.')
# wait for the shutdown event
@cherrypy.expose
def metrics(self) -> str:
- cherrypy.response.headers['Content-Type'] = 'text/plain'
return ''
cherrypy.tree.mount(Root(), '/', {})