mgr/cephadm: Adding --storage.tsdb.retention.size prometheus option
author Redouane Kachach <rkachach@redhat.com>
Wed, 7 Sep 2022 12:51:10 +0000 (14:51 +0200)
committer Redouane Kachach <rkachach@redhat.com>
Thu, 8 Sep 2022 11:51:08 +0000 (13:51 +0200)
Fixes: https://tracker.ceph.com/issues/57422

Signed-off-by: Redouane Kachach <rkachach@redhat.com>
doc/cephadm/services/monitoring.rst
src/cephadm/cephadm
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_services.py
src/python-common/ceph/deployment/service_spec.py

index 157332564e5d14004aa5865d5dacc3e303a11a4f..0f67d3f0044e6d962fca6c45cc042f3c2e4f8c22 100644 (file)
@@ -341,13 +341,16 @@ and the metrics will not be visible in Prometheus.
 Setting up Prometheus
 -----------------------
 
-Setting Prometheus Retention Time
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Setting Prometheus Retention Size and Time
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Cephadm provides the option to set the Prometheus TDSB retention time using
-a ``retention_time`` field in the Prometheus service spec. The value defaults
-to 15 days (15d). If you would like a different value, such as 1 year (1y) you
-can apply a service spec similar to:
+Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
+and ``retention_size`` values in the Prometheus service spec.
+The retention time value defaults to 15 days (15d). A different value can be set
+using one of the supported units: 'y', 'w', 'd', 'h', 'm' or 's'. The retention size
+value defaults to 0 (disabled); supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
+
+In the following example spec, we set the retention time to 1 year and the retention size to 1GB.
 
 .. code-block:: yaml
 
@@ -356,6 +359,7 @@ can apply a service spec similar to:
       count: 1
     spec:
       retention_time: "1y"
+      retention_size: "1GB"
 
 .. note::
 
index fa03cb6441c22618cfa1addc8b9b3a1a0a561931..6515a5fafbf790d037db40f1257c7fca0e98ed56 100755 (executable)
@@ -2639,7 +2639,9 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
             if daemon_type == 'prometheus':
                 config = get_parm(ctx.config_json)
                 retention_time = config.get('retention_time', '15d')
+                retention_size = config.get('retention_size', '0')  # default to disabled
                 r += [f'--storage.tsdb.retention.time={retention_time}']
+                r += [f'--storage.tsdb.retention.size={retention_size}']
                 scheme = 'http'
                 host = get_fqdn()
                 r += [f'--web.external-url={scheme}://{host}:{port}']
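
This hunk is where the spec values finally become Prometheus command-line
flags. A minimal standalone sketch of that mapping (the function name is
hypothetical; the defaults mirror the diff):

    # A sketch, not cephadm's API: mirrors how the hunk above turns the
    # daemon's config JSON into Prometheus flags. '15d' is the retention-time
    # default; '0' disables size-based retention.
    def retention_args(config: dict) -> list:
        retention_time = config.get('retention_time', '15d')
        retention_size = config.get('retention_size', '0')
        return [
            f'--storage.tsdb.retention.time={retention_time}',
            f'--storage.tsdb.retention.size={retention_size}',
        ]

    assert retention_args({'retention_size': '1GB'}) == [
        '--storage.tsdb.retention.time=15d',
        '--storage.tsdb.retention.size=1GB',
    ]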
index e3d90f13aebd24b5efaea19e43c7a4dc406ba2c5..d0a268f0c4c385bd7a0a2f088180f36e31e2efdf 100644 (file)
@@ -28,7 +28,7 @@ from ceph.deployment.drive_group import DriveGroupSpec
 from ceph.deployment.service_spec import \
     ServiceSpec, PlacementSpec, \
     HostPlacementSpec, IngressSpec, \
-    TunedProfileSpec
+    TunedProfileSpec, PrometheusSpec
 from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
 from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
@@ -2533,6 +2533,19 @@ Then run the following:
             # should only refresh if a change has been detected
             self._trigger_preview_refresh(specs=[cast(DriveGroupSpec, spec)])
 
+        if spec.service_type == 'prometheus':
+            spec = cast(PrometheusSpec, spec)
+            if spec.retention_time:
+                valid_units = ['y', 'w', 'd', 'h', 'm', 's']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_time)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention time. Valid units are: {', '.join(valid_units)}")
+            if spec.retention_size:
+                valid_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_size)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention size. Valid units are: {', '.join(valid_units)}")
+
         return self._apply_service_spec(cast(ServiceSpec, spec))
 
     @handle_orch_error
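
The validation added above reduces to "an integer followed by exactly one
allowed unit". A self-contained sketch of the same check (the helper name is
hypothetical, not part of the module):

    import re

    # Same pattern as the hunk above: one or more digits followed by exactly
    # one supported unit, anchored at both ends and case-sensitive.
    def valid_retention(value: str, valid_units: list) -> bool:
        return re.search(rf"^(\d+)({'|'.join(valid_units)})$", value) is not None

    time_units = ['y', 'w', 'd', 'h', 'm', 's']
    size_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

    assert valid_retention('1y', time_units)
    assert not valid_retention('100.00y', time_units)  # fractional values rejected
    assert not valid_retention('10', time_units)       # unit is mandatory
    assert not valid_retention('100b', size_units)     # units are case-sensitive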
index f111e00bb2def358d6a60d1dc8bbea0743e98f05..5942a92597bfa378ce50758f8a7016b9c733b9fc 100644 (file)
@@ -298,6 +298,12 @@ class PrometheusService(CephadmService):
         except AttributeError:
             retention_time = '15d'
 
+        try:
+            retention_size = spec.retention_size if spec.retention_size else '0'
+        except AttributeError:
+            # default to disabled
+            retention_size = '0'
+
         t = self.mgr.get('mgr_map').get('services', {}).get('prometheus', None)
         sd_port = self.mgr.service_discovery_port
         srv_end_point = ''
@@ -332,7 +338,8 @@ class PrometheusService(CephadmService):
                 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context),
                 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert()
             },
-            'retention_time': retention_time
+            'retention_time': retention_time,
+            'retention_size': retention_size
         }
 
         # include alerts, if present in the container
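
The try/except above is deliberate: the spec handed to this service may lack
the Prometheus-specific attribute entirely (the test updates below swap
MonitoringSpec('prometheus') for PrometheusSpec('prometheus') for exactly this
reason), so a falsy check alone would not be safe. A minimal sketch of the
fallback (the class name is hypothetical):

    # An object without the attribute raises AttributeError, which the
    # fallback converts into the 'disabled' default.
    class LegacySpec:
        pass  # no retention_size attribute at all

    spec = LegacySpec()
    try:
        retention_size = spec.retention_size if spec.retention_size else '0'
    except AttributeError:
        retention_size = '0'  # default to disabled
    assert retention_size == '0'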
index 354ee338a5cfb768ec6e8c58b295a60f10965da0..ccf3270c1d63e2cbed1ea5e3233a056b097f73a2 100644 (file)
@@ -17,7 +17,7 @@ except ImportError:
 
 from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, RGWSpec, \
     NFSServiceSpec, IscsiServiceSpec, HostPlacementSpec, CustomContainerSpec, MDSSpec, \
-    CustomConfig
+    CustomConfig, PrometheusSpec
 from ceph.deployment.drive_selection.selector import DriveSelection
 from ceph.deployment.inventory import Devices, Device
 from ceph.utils import datetime_to_str, datetime_now
@@ -1508,6 +1508,64 @@ class TestCephadm(object):
             with with_service(cephadm_module, spec, meth, 'test'):
                 pass
 
+    @pytest.mark.parametrize(
+        "spec, raise_exception, msg",
+        [
+            # Valid retention_time values (valid units: 'y', 'w', 'd', 'h', 'm', 's')
+            (PrometheusSpec(retention_time='1y'), False, ''),
+            (PrometheusSpec(retention_time=' 10w '), False, ''),
+            (PrometheusSpec(retention_time=' 1348d'), False, ''),
+            (PrometheusSpec(retention_time='2000h '), False, ''),
+            (PrometheusSpec(retention_time='173847m'), False, ''),
+            (PrometheusSpec(retention_time='200s'), False, ''),
+            (PrometheusSpec(retention_time='  '), False, ''),  # default value will be used
+
+            # Invalid retention_time values
+            (PrometheusSpec(retention_time='100k'), True, '^Invalid retention time'),     # invalid unit
+            (PrometheusSpec(retention_time='10'), True, '^Invalid retention time'),       # no unit
+            (PrometheusSpec(retention_time='100.00y'), True, '^Invalid retention time'),  # invalid value and valid unit
+            (PrometheusSpec(retention_time='100.00k'), True, '^Invalid retention time'),  # invalid value and invalid unit
+            (PrometheusSpec(retention_time='---'), True, '^Invalid retention time'),      # invalid value
+
+            # Valid retention_size values (valid units: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
+            (PrometheusSpec(retention_size='123456789B'), False, ''),
+            (PrometheusSpec(retention_size=' 200KB'), False, ''),
+            (PrometheusSpec(retention_size='99999MB '), False, ''),
+            (PrometheusSpec(retention_size=' 10GB '), False, ''),
+            (PrometheusSpec(retention_size='100TB'), False, ''),
+            (PrometheusSpec(retention_size='500PB'), False, ''),
+            (PrometheusSpec(retention_size='200EB'), False, ''),
+            (PrometheusSpec(retention_size='  '), False, ''),  # default value will be used
+
+            # Invalid retention_size values
+            (PrometheusSpec(retention_size='100b'), True, '^Invalid retention size'),      # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='333kb'), True, '^Invalid retention size'),     # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='2000'), True, '^Invalid retention size'),      # no unit
+            (PrometheusSpec(retention_size='200.00PB'), True, '^Invalid retention size'),  # invalid value and valid unit
+            (PrometheusSpec(retention_size='400.B'), True, '^Invalid retention size'),     # invalid value and valid unit
+            (PrometheusSpec(retention_size='10.000s'), True, '^Invalid retention size'),   # invalid value and invalid unit
+            (PrometheusSpec(retention_size='...'), True, '^Invalid retention size'),       # invalid value
+
+            # valid retention_size and valid retention_time
+            (PrometheusSpec(retention_time='1y', retention_size='100GB'), False, ''),
+            # invalid retention_time and valid retention_size
+            (PrometheusSpec(retention_time='1j', retention_size='100GB'), True, '^Invalid retention time'),
+            # valid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1y', retention_size='100gb'), True, '^Invalid retention size'),
+            # invalid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1i', retention_size='100gb'), True, '^Invalid retention time'),
+        ])
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+    def test_apply_prometheus(self, spec: PrometheusSpec, raise_exception: bool, msg: str, cephadm_module: CephadmOrchestrator):
+        with with_host(cephadm_module, 'test'):
+            if not raise_exception:
+                cephadm_module._apply(spec)
+            else:
+                with pytest.raises(OrchestratorError, match=msg):
+                    cephadm_module._apply(spec)
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_mds_config_purge(self, cephadm_module: CephadmOrchestrator):
         spec = MDSSpec('mds', service_id='fsname', config={'test': 'foo'})
index 22e58317af0d7b6d821c518ffaab67fa98526fde..8655a7119a8cc7ba4acff58f2e76926bac2555ed 100644 (file)
@@ -17,7 +17,7 @@ from cephadm.services.monitoring import GrafanaService, AlertmanagerService, Pro
     NodeExporterService, LokiService, PromtailService
 from cephadm.module import CephadmOrchestrator
 from ceph.deployment.service_spec import IscsiServiceSpec, MonitoringSpec, AlertManagerSpec, \
-    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec
+    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec, PrometheusSpec
 from cephadm.tests.fixtures import with_host, with_service, _run_cephadm, async_side_effect
 
 from orchestrator import OrchestratorError
@@ -392,7 +392,7 @@ class TestMonitoring:
 
         with with_host(cephadm_module, 'test'):
             with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \
-                    with_service(cephadm_module, MonitoringSpec('prometheus')) as _:
+                    with_service(cephadm_module, PrometheusSpec('prometheus')) as _:
 
                 y = dedent("""
                 # This file is generated by cephadm.
@@ -431,7 +431,9 @@ class TestMonitoring:
                         '--tcp-ports', '9095'
                     ],
                     stdin=json.dumps({"files": {"prometheus.yml": y, "root_cert.pem": '',
-                                                "/etc/prometheus/alerting/custom_alerts.yml": ""}, 'retention_time': '15d'}),
+                                                "/etc/prometheus/alerting/custom_alerts.yml": ""},
+                                      'retention_time': '15d',
+                                      'retention_size': '0'}),
                     image='')
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -536,7 +538,7 @@ class TestMonitoring:
             cephadm_module.set_store("test/grafana_crt", "c")
             cephadm_module.set_store("test/grafana_key", "k")
             with with_service(
-                cephadm_module, MonitoringSpec("prometheus")
+                cephadm_module, PrometheusSpec("prometheus")
             ) as _, with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service(
                 cephadm_module, GrafanaSpec("grafana")
             ) as _:
index 9ff800f42eaf17d6db15786e752c21ef282ec59b..16db5ed7cc4e423712aca738e1d59a370576daf0 100644 (file)
@@ -1272,6 +1272,7 @@ class PrometheusSpec(MonitoringSpec):
                  networks: Optional[List[str]] = None,
                  port: Optional[int] = None,
                  retention_time: Optional[str] = None,
+                 retention_size: Optional[str] = None,
                  extra_container_args: Optional[List[str]] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
                  ):
@@ -1282,7 +1283,8 @@ class PrometheusSpec(MonitoringSpec):
             preview_only=preview_only, config=config, networks=networks, port=port,
             extra_container_args=extra_container_args, custom_configs=custom_configs)
 
-        self.retention_time = retention_time
+        self.retention_time = retention_time.strip() if retention_time else None
+        self.retention_size = retention_size.strip() if retention_size else None
 
 
 yaml.add_representer(PrometheusSpec, ServiceSpec.yaml_representer)
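
End to end, the new field behaves like this (a usage sketch, assuming the ceph
python-common package from this tree is importable): both retention strings
are stripped at construction time, so padded input normalizes before the
mgr-side validation runs.

    from ceph.deployment.service_spec import PrometheusSpec

    # Whitespace is stripped on assignment (see the constructor change above),
    # so ' 10GB ' is stored as '10GB'; empty strings fall back to the defaults
    # applied in the monitoring service.
    spec = PrometheusSpec('prometheus', retention_time=' 1y ', retention_size=' 10GB ')
    assert spec.retention_time == '1y'
    assert spec.retention_size == '10GB'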