From 91dd03fd648d25773a83fdad311b62b781619fc4 Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 22 Aug 2022 11:14:12 -0400 Subject: [PATCH] mgr/cephadm: allow setting prometheus retention time When we deploy Prometheus server, we don't provide any ability to define the tsdb retention time - so it defaults to 15d. This change adds a field that can be passed in a prometheus service spec that will be passed as an arg to the --storage.tsdb.retention.time parameter for the prometheus daemon. Fixes: https://tracker.ceph.com/issues/54308 Signed-off-by: Adam King --- src/cephadm/cephadm | 3 ++ src/cephadm/tests/test_cephadm.py | 1 + src/pybind/mgr/cephadm/services/monitoring.py | 15 ++++++++-- src/pybind/mgr/cephadm/tests/test_services.py | 2 +- .../ceph/deployment/service_spec.py | 29 ++++++++++++++++++- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm index d92451165d2..6e097f863a8 100755 --- a/src/cephadm/cephadm +++ b/src/cephadm/cephadm @@ -2637,6 +2637,9 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): port = meta['ports'][0] r += [f'--web.listen-address={ip}:{port}'] if daemon_type == 'prometheus': + config = get_parm(ctx.config_json) + retention_time = config.get('retention_time', '15d') + r += [f'--storage.tsdb.retention.time={retention_time}'] scheme = 'http' host = get_fqdn() r += [f'--web.external-url={scheme}://{host}:{port}'] diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index 937d8629041..2702c81eddb 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -1160,6 +1160,7 @@ class TestMonitoring(object): def test_prometheus_external_url(self): ctx = cd.CephadmContext() + ctx.config_json = json.dumps({'files': {}, 'retention_time': '15d'}) daemon_type = 'prometheus' daemon_id = 'home' fsid = 'aaf5a720-13fe-4a3b-82b9-2d99b7fd9704' diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 107a4f74d06..f111e00bb2d 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -9,7 +9,8 @@ from urllib.parse import urlparse from mgr_module import HandleCommandResult from orchestrator import DaemonDescription -from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, SNMPGatewaySpec +from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, \ + SNMPGatewaySpec, PrometheusSpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url @@ -290,6 +291,13 @@ class PrometheusService(CephadmService): assert self.TYPE == daemon_spec.daemon_type + spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec) + + try: + retention_time = spec.retention_time if spec.retention_time else '15d' + except AttributeError: + retention_time = '15d' + t = self.mgr.get('mgr_map').get('services', {}).get('prometheus', None) sd_port = self.mgr.service_discovery_port srv_end_point = '' @@ -319,11 +327,12 @@ class PrometheusService(CephadmService): 'haproxy_sd_url': haproxy_sd_url, } - r = { + r: Dict[str, Any] = { 'files': { 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context), 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert() - } + }, + 'retention_time': retention_time } # include alerts, if present in the container diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index d6f16834e24..22e58317af0 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -431,7 +431,7 @@ class TestMonitoring: '--tcp-ports', '9095' ], stdin=json.dumps({"files": {"prometheus.yml": y, "root_cert.pem": '', - "/etc/prometheus/alerting/custom_alerts.yml": ""}}), + "/etc/prometheus/alerting/custom_alerts.yml": ""}, 'retention_time': '15d'}), image='') @patch("cephadm.serve.CephadmServe._run_cephadm") diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 62543e692d6..9ff800f42ea 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -520,7 +520,7 @@ class ServiceSpec(object): 'container': CustomContainerSpec, 'grafana': GrafanaSpec, 'node-exporter': MonitoringSpec, - 'prometheus': MonitoringSpec, + 'prometheus': PrometheusSpec, 'loki': MonitoringSpec, 'promtail': MonitoringSpec, 'snmp-gateway': SNMPGatewaySpec, @@ -1261,6 +1261,33 @@ class GrafanaSpec(MonitoringSpec): yaml.add_representer(GrafanaSpec, ServiceSpec.yaml_representer) +class PrometheusSpec(MonitoringSpec): + def __init__(self, + service_type: str = 'prometheus', + service_id: Optional[str] = None, + placement: Optional[PlacementSpec] = None, + unmanaged: bool = False, + preview_only: bool = False, + config: Optional[Dict[str, str]] = None, + networks: Optional[List[str]] = None, + port: Optional[int] = None, + retention_time: Optional[str] = None, + extra_container_args: Optional[List[str]] = None, + custom_configs: Optional[List[CustomConfig]] = None, + ): + assert service_type == 'prometheus' + super(PrometheusSpec, self).__init__( + 'prometheus', service_id=service_id, + placement=placement, unmanaged=unmanaged, + preview_only=preview_only, config=config, networks=networks, port=port, + extra_container_args=extra_container_args, custom_configs=custom_configs) + + self.retention_time = retention_time + + +yaml.add_representer(PrometheusSpec, ServiceSpec.yaml_representer) + + class SNMPGatewaySpec(ServiceSpec): class SNMPVersion(str, enum.Enum): V2c = 'V2c' -- 2.39.5