mgr/cephadm: Adding --storage.tsdb.retention.size prometheus option
author     Redouane Kachach <rkachach@redhat.com>
           Wed, 7 Sep 2022 12:51:10 +0000 (14:51 +0200)
committer  Adam King <adking@redhat.com>
           Thu, 25 May 2023 16:36:09 +0000 (12:36 -0400)
fixes: https://tracker.ceph.com/issues/57422

Signed-off-by: Redouane Kachach <rkachach@redhat.com>
(cherry picked from commit 4da92c59597dcbf0bba4be50db73233e34108ca9)

Conflicts:
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_services.py

doc/cephadm/services/monitoring.rst
src/cephadm/cephadm
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_services.py
src/python-common/ceph/deployment/service_spec.py

diff --git a/doc/cephadm/services/monitoring.rst b/doc/cephadm/services/monitoring.rst
index 86e3e3f699abbf48cf410eba38ff8b593cd20dbd..f29a93e8298a4091cb661b77444378c161029001 100644
@@ -299,13 +299,16 @@ and the metrics will not be visible in Prometheus.
 Setting up Prometheus
 -----------------------
 
-Setting Prometheus Retention Time
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Setting Prometheus Retention Size and Time
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Cephadm provides the option to set the Prometheus TDSB retention time using
-a ``retention_time`` field in the Prometheus service spec. The value defaults
-to 15 days (15d). If you would like a different value, such as 1 year (1y) you
-can apply a service spec similar to:
+Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
+and ``retention_size`` values in the Prometheus service spec.
+The retention time defaults to 15 days (15d) and accepts the units
+'y', 'w', 'd', 'h', 'm' and 's'. The retention size defaults to 0
+(disabled) and accepts the units 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
+
+The following example spec sets the retention time to 1 year and the size to 1GB:
 
 .. code-block:: yaml
 
@@ -314,6 +317,7 @@ can apply a service spec similar to:
       count: 1
     spec:
       retention_time: "1y"
+      retention_size: "1GB"
 
 .. note::
 
diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index c821d8d8a129a462f9d8a8c94edb86de362d9bc3..18b02d3bf2f999849eddc6601825d003c1788b5d 100755
@@ -2394,7 +2394,9 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
             if daemon_type == 'prometheus':
                 config = get_parm(ctx.config_json)
                 retention_time = config.get('retention_time', '15d')
+                retention_size = config.get('retention_size', '0')  # default to disabled
                 r += [f'--storage.tsdb.retention.time={retention_time}']
+                r += [f'--storage.tsdb.retention.size={retention_size}']
                 scheme = 'http'
                 host = get_fqdn()
                 # in case host is not an fqdn then we use the IP to
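
For illustration, a minimal standalone sketch (not part of the commit) of the flag-building logic above; here `config` stands in for the daemon's parsed `config_json`:

    # Sketch of how get_daemon_args maps the daemon config onto
    # Prometheus command-line flags (both keys are optional).
    config = {'retention_time': '1y', 'retention_size': '1GB'}

    retention_time = config.get('retention_time', '15d')  # Prometheus' stock default
    retention_size = config.get('retention_size', '0')    # '0' disables size-based retention

    args = [
        f'--storage.tsdb.retention.time={retention_time}',
        f'--storage.tsdb.retention.size={retention_size}',
    ]
    print(args)
    # ['--storage.tsdb.retention.time=1y', '--storage.tsdb.retention.size=1GB']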
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 07f9479bd5174b60a6e7971f9476e749205d4687..682ea210748671cd741aad8c0b557673d3eeab4a 100644
@@ -26,7 +26,8 @@ from ceph.deployment import inventory
 from ceph.deployment.drive_group import DriveGroupSpec
 from ceph.deployment.service_spec import \
     ServiceSpec, PlacementSpec, \
-    HostPlacementSpec, IngressSpec, IscsiServiceSpec
+    HostPlacementSpec, IngressSpec, \
+    IscsiServiceSpec, PrometheusSpec
 from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
 from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
@@ -2609,6 +2610,19 @@ Then run the following:
             # should only refresh if a change has been detected
             self._trigger_preview_refresh(specs=[cast(DriveGroupSpec, spec)])
 
+        if spec.service_type == 'prometheus':
+            spec = cast(PrometheusSpec, spec)
+            if spec.retention_time:
+                valid_units = ['y', 'w', 'd', 'h', 'm', 's']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_time)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention time. Valid units are: {', '.join(valid_units)}")
+            if spec.retention_size:
+                valid_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_size)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention size. Valid units are: {', '.join(valid_units)}")
+
         return self._apply_service_spec(cast(ServiceSpec, spec))
 
     def set_health_warning(self, name: str, summary: str, count: int, detail: List[str]) -> None:
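
The validation added to `_apply` reduces to one anchored regex per field. A self-contained sketch of the accepted grammar (the helper name is ours, not cephadm's):

    import re

    TIME_UNITS = ['y', 'w', 'd', 'h', 'm', 's']
    SIZE_UNITS = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

    def valid_retention(value, units):
        # '<positive integer><unit>', fully anchored: floats, bare
        # numbers and trailing junk all fail to match.
        return re.search(rf"^(\d+)({'|'.join(units)})$", value) is not None

    assert valid_retention('1y', TIME_UNITS)
    assert not valid_retention('100.00y', TIME_UNITS)  # no fractional values
    assert not valid_retention('10', TIME_UNITS)       # unit is mandatory
    assert valid_retention('100GB', SIZE_UNITS)
    assert not valid_retention('100gb', SIZE_UNITS)    # size units are case-sensitive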
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 8de7195a3ad9eb38986388fa0fe52dd48eb00206..479ea30f17817c06a33e56c88905be4502a508f5 100644
@@ -288,6 +288,12 @@ class PrometheusService(CephadmService):
         except AttributeError:
             retention_time = '15d'
 
+        try:
+            retention_size = prom_spec.retention_size if prom_spec.retention_size else '0'
+        except AttributeError:
+            # default to disabled
+            retention_size = '0'
+
         # scrape mgrs
         mgr_scrape_list = []
         mgr_map = self.mgr.get('mgr_map')
@@ -366,7 +372,8 @@ class PrometheusService(CephadmService):
                     self.mgr.template.render(
                         'services/prometheus/prometheus.yml.j2', context)
             },
-            'retention_time': retention_time
+            'retention_time': retention_time,
+            'retention_size': retention_size
         }
 
         # include alerts, if present in the container
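
Both values end up in the deployment payload that the cephadm binary's `get_daemon_args` later reads back (see the diff of src/cephadm/cephadm above). A sketch of the relevant shape of that payload, with the defaults used here (the `daemon_config` name is ours):

    # Shape of the config handed to the cephadm binary; matches the
    # expectation asserted in test_services.py further down.
    daemon_config = {
        'files': {'prometheus.yml': '...'},  # rendered from prometheus.yml.j2
        'retention_time': '15d',
        'retention_size': '0',
    }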
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index fa80bc4578d190f88ea831494bca27f77e96d036..b050aa6d018ec46eab77b596e3ec14c379b12e92 100644
@@ -17,7 +17,7 @@ from execnet.gateway_bootstrap import HostNotFound
 
 from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, RGWSpec, \
     NFSServiceSpec, IscsiServiceSpec, HostPlacementSpec, CustomContainerSpec, MDSSpec, \
-    CustomConfig
+    CustomConfig, PrometheusSpec
 from ceph.deployment.drive_selection.selector import DriveSelection
 from ceph.deployment.inventory import Devices, Device
 from ceph.utils import datetime_to_str, datetime_now
@@ -1446,7 +1446,62 @@ class TestCephadm(object):
             with with_service(cephadm_module, spec, meth, 'test'):
                 pass
 
-    @mock.patch("cephadm.serve.CephadmServe._deploy_cephadm_binary", _deploy_cephadm_binary('test'))
+    @pytest.mark.parametrize(
+        "spec, raise_exception, msg",
+        [
+            # Valid retention_time values (valid units: 'y', 'w', 'd', 'h', 'm', 's')
+            (PrometheusSpec(retention_time='1y'), False, ''),
+            (PrometheusSpec(retention_time=' 10w '), False, ''),
+            (PrometheusSpec(retention_time=' 1348d'), False, ''),
+            (PrometheusSpec(retention_time='2000h '), False, ''),
+            (PrometheusSpec(retention_time='173847m'), False, ''),
+            (PrometheusSpec(retention_time='200s'), False, ''),
+            (PrometheusSpec(retention_time='  '), False, ''),  # default value will be used
+
+            # Invalid retention_time values
+            (PrometheusSpec(retention_time='100k'), True, '^Invalid retention time'),     # invalid unit
+            (PrometheusSpec(retention_time='10'), True, '^Invalid retention time'),       # no unit
+            (PrometheusSpec(retention_time='100.00y'), True, '^Invalid retention time'),  # invalid value and valid unit
+            (PrometheusSpec(retention_time='100.00k'), True, '^Invalid retention time'),  # invalid value and invalid unit
+            (PrometheusSpec(retention_time='---'), True, '^Invalid retention time'),      # invalid value
+
+            # Valid retention_size values (valid units: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
+            (PrometheusSpec(retention_size='123456789B'), False, ''),
+            (PrometheusSpec(retention_size=' 200KB'), False, ''),
+            (PrometheusSpec(retention_size='99999MB '), False, ''),
+            (PrometheusSpec(retention_size=' 10GB '), False, ''),
+            (PrometheusSpec(retention_size='100TB'), False, ''),
+            (PrometheusSpec(retention_size='500PB'), False, ''),
+            (PrometheusSpec(retention_size='200EB'), False, ''),
+            (PrometheusSpec(retention_size='  '), False, ''),  # default value will be used
+
+            # Invalid retention_size values
+            (PrometheusSpec(retention_size='100b'), True, '^Invalid retention size'),      # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='333kb'), True, '^Invalid retention size'),     # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='2000'), True, '^Invalid retention size'),      # no unit
+            (PrometheusSpec(retention_size='200.00PB'), True, '^Invalid retention size'),  # invalid value and valid unit
+            (PrometheusSpec(retention_size='400.B'), True, '^Invalid retention size'),     # invalid value and invalid unit
+            (PrometheusSpec(retention_size='10.000s'), True, '^Invalid retention size'),   # invalid value and invalid unit
+            (PrometheusSpec(retention_size='...'), True, '^Invalid retention size'),       # invalid value
+
+            # valid retention_size and valid retention_time
+            (PrometheusSpec(retention_time='1y', retention_size='100GB'), False, ''),
+            # invalid retention_time and valid retention_size
+            (PrometheusSpec(retention_time='1j', retention_size='100GB'), True, '^Invalid retention time'),
+            # valid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1y', retention_size='100gb'), True, '^Invalid retention size'),
+            # invalid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1i', retention_size='100gb'), True, '^Invalid retention time'),
+        ])
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+    def test_apply_prometheus(self, spec: PrometheusSpec, raise_exception: bool, msg: str, cephadm_module: CephadmOrchestrator):
+        with with_host(cephadm_module, 'test'):
+            if not raise_exception:
+                cephadm_module._apply(spec)
+            else:
+                with pytest.raises(OrchestratorError, match=msg):
+                    cephadm_module._apply(spec)
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_mds_config_purge(self, cephadm_module: CephadmOrchestrator):
         spec = MDSSpec('mds', service_id='fsname', config={'test': 'foo'})
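
A side note on the `match=` strings in the new test: pytest applies them with `re.search` against the stringified exception, so the leading '^' anchors assert the message prefix. A minimal standalone illustration:

    import re

    # pytest.raises(..., match=pattern) is equivalent to this check:
    msg = 'Invalid retention time. Valid units are: y, w, d, h, m, s'
    assert re.search('^Invalid retention time', msg)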
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 23bef8df726669d79dadf6dcdacf04cb7163444b..a979556c046deae9b4a23093cde5e663b4dae3ad 100644
@@ -17,7 +17,8 @@ from cephadm.services.monitoring import GrafanaService, AlertmanagerService, Pro
 from cephadm.services.exporter import CephadmExporter
 from cephadm.module import CephadmOrchestrator
 from ceph.deployment.service_spec import IscsiServiceSpec, MonitoringSpec, AlertManagerSpec, \
-    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, NFSServiceSpec
+    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, \
+    NFSServiceSpec, PrometheusSpec
 from cephadm.tests.fixtures import with_host, with_service, _run_cephadm
 
 from ceph.utils import datetime_now
@@ -440,7 +441,7 @@ class TestMonitoring:
 
         with with_host(cephadm_module, 'test'):
             with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \
-                    with_service(cephadm_module, MonitoringSpec('prometheus')) as _:
+                    with_service(cephadm_module, PrometheusSpec('prometheus')) as _:
 
                 y = dedent("""
                 # This file is generated by cephadm.
@@ -478,7 +479,8 @@ class TestMonitoring:
                     ],
                     stdin=json.dumps({"files": {"prometheus.yml": y,
                                                 "/etc/prometheus/alerting/custom_alerts.yml": ""},
-                                      'retention_time': '15d'}),
+                                      'retention_time': '15d',
+                                      'retention_size': '0'}),
                     image='')
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -490,7 +492,7 @@ class TestMonitoring:
         with with_host(cephadm_module, 'test'):
             cephadm_module.set_store('test/grafana_crt', 'c')
             cephadm_module.set_store('test/grafana_key', 'k')
-            with with_service(cephadm_module, MonitoringSpec('prometheus')) as _, \
+            with with_service(cephadm_module, PrometheusSpec('prometheus')) as _, \
                     with_service(cephadm_module, GrafanaSpec('grafana')) as _:
                 files = {
                     'grafana.ini': dedent("""
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 8cd3ef0f0364f52ed031a0028f3dc01f2c125789..1c676e63963fa0cfbed36c43c2e01f078f84aceb 100644
@@ -1282,6 +1282,7 @@ class PrometheusSpec(MonitoringSpec):
                  networks: Optional[List[str]] = None,
                  port: Optional[int] = None,
                  retention_time: Optional[str] = None,
+                 retention_size: Optional[str] = None,
                  extra_container_args: Optional[List[str]] = None,
                  extra_entrypoint_args: Optional[List[str]] = None,
                  ):
@@ -1292,7 +1293,8 @@ class PrometheusSpec(MonitoringSpec):
             preview_only=preview_only, config=config, networks=networks, port=port,
             extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args)
 
-        self.retention_time = retention_time
+        self.retention_time = retention_time.strip() if retention_time else None
+        self.retention_size = retention_size.strip() if retention_size else None
 
 
 yaml.add_representer(PrometheusSpec, ServiceSpec.yaml_representer)
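
Finally, a hypothetical usage sketch of the stripping added to PrometheusSpec, which is why padded test values such as ' 10w ' above validate cleanly:

    # Surrounding whitespace is stripped at spec-construction time,
    # before the regex validation in module.py runs.
    spec = PrometheusSpec('prometheus', retention_time=' 10w ', retention_size=' 10GB ')
    assert spec.retention_time == '10w'
    assert spec.retention_size == '10GB'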