From f967ac061ebee362cdc82c458e955da75a9045e9 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Wed, 25 May 2022 12:27:23 +0200 Subject: [PATCH] mgr/cephadm: enabling security on prometheus monitoring stack Signed-off-by: Redouane Kachach --- src/cephadm/cephadm.py | 24 +- src/pybind/mgr/cephadm/http_server.py | 8 +- src/pybind/mgr/cephadm/module.py | 84 +++++- src/pybind/mgr/cephadm/serve.py | 12 +- src/pybind/mgr/cephadm/service_discovery.py | 53 +++- src/pybind/mgr/cephadm/services/monitoring.py | 155 +++++++++-- .../services/alertmanager/alertmanager.yml.j2 | 4 + .../services/alertmanager/web.yml.j2 | 5 + .../services/grafana/ceph-dashboard.yml.j2 | 21 +- .../services/node-exporter/web.yml.j2 | 3 + .../services/prometheus/prometheus.yml.j2 | 50 +++- .../templates/services/prometheus/web.yml.j2 | 5 + .../cephadm/tests/test_service_discovery.py | 9 +- src/pybind/mgr/cephadm/tests/test_services.py | 255 ++++++++++++++++-- .../mgr/dashboard/controllers/prometheus.py | 58 +++- .../mgr/dashboard/services/access_control.py | 12 +- .../mgr/dashboard/tests/test_prometheus.py | 18 +- src/pybind/mgr/mgr_util.py | 11 + src/pybind/mgr/orchestrator/_interface.py | 8 + src/pybind/mgr/orchestrator/module.py | 12 + src/pybind/mgr/prometheus/module.py | 71 +++-- src/pybind/mgr/requirements-required.txt | 1 + 22 files changed, 757 insertions(+), 122 deletions(-) create mode 100644 src/pybind/mgr/cephadm/templates/services/alertmanager/web.yml.j2 create mode 100644 src/pybind/mgr/cephadm/templates/services/node-exporter/web.yml.j2 create mode 100644 src/pybind/mgr/cephadm/templates/services/prometheus/web.yml.j2 diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 07dcecd0652..ec67567b67e 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -589,7 +589,7 @@ class Monitoring(object): 'cpus': '1', 'memory': '1GB', 'args': [ - '--no-collector.timex', + '--no-collector.timex' ], }, 'grafana': { @@ -2711,11 +2711,26 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): peers = config.get('peers', list()) # type: ignore for peer in peers: r += ['--cluster.peer={}'.format(peer)] + try: + r += [f'--web.config.file={config["web_config"]}'] + except KeyError: + pass # some alertmanager, by default, look elsewhere for a config r += ['--config.file=/etc/alertmanager/alertmanager.yml'] if daemon_type == 'promtail': r += ['--config.expand-env'] + if daemon_type == 'prometheus': + config = get_parm(ctx.config_json) + try: + r += [f'--web.config.file={config["web_config"]}'] + except KeyError: + pass if daemon_type == 'node-exporter': + config = get_parm(ctx.config_json) + try: + r += [f'--web.config={config["web_config"]}'] + except KeyError: + pass r += ['--path.procfs=/host/proc', '--path.sysfs=/host/sys', '--path.rootfs=/rootfs'] @@ -2806,6 +2821,12 @@ def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid, config_dir = 'etc/loki' makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755) makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755) + elif daemon_type == 'node-exporter': + data_dir_root = get_data_dir(fsid, ctx.data_dir, + daemon_type, daemon_id) + config_dir = 'etc/node-exporter' + makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755) + recursive_chown(os.path.join(data_dir_root, 'etc'), uid, gid) # populate the config directory for the component from the config-json if 'files' in config_json: @@ -3039,6 +3060,7 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id, mounts[log_dir] = '/var/log/ceph:z' mounts[os.path.join(data_dir, 'data')] = '/promtail:Z' elif daemon_type == 'node-exporter': + mounts[os.path.join(data_dir, 'etc/node-exporter')] = '/etc/node-exporter:Z' mounts['/proc'] = '/host/proc:ro' mounts['/sys'] = '/host/sys:ro' mounts['/'] = '/rootfs:ro' diff --git a/src/pybind/mgr/cephadm/http_server.py b/src/pybind/mgr/cephadm/http_server.py index 0c0b940aa94..ef29d3b4e75 100644 --- a/src/pybind/mgr/cephadm/http_server.py +++ b/src/pybind/mgr/cephadm/http_server.py @@ -31,6 +31,7 @@ class CephadmHttpServer(threading.Thread): self.service_discovery = ServiceDiscovery(mgr) self.cherrypy_shutdown_event = threading.Event() self._service_discovery_port = self.mgr.service_discovery_port + self.secure_monitoring_stack = self.mgr.secure_monitoring_stack super().__init__(target=self.run) def configure_cherrypy(self) -> None: @@ -42,10 +43,15 @@ class CephadmHttpServer(threading.Thread): def configure(self) -> None: self.configure_cherrypy() self.agent.configure() - self.service_discovery.configure(self.mgr.service_discovery_port, self.mgr.get_mgr_ip()) + self.service_discovery.configure(self.mgr.service_discovery_port, + self.mgr.get_mgr_ip(), + self.secure_monitoring_stack) def config_update(self) -> None: self.service_discovery_port = self.mgr.service_discovery_port + if self.secure_monitoring_stack != self.mgr.secure_monitoring_stack: + self.secure_monitoring_stack = self.mgr.secure_monitoring_stack + self.restart() @property def service_discovery_port(self) -> int: diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index ca4c1a482cf..2f646cabbb1 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -442,6 +442,36 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=False, desc='Log all refresh metadata. Includes daemon, device, and host info collected regularly. Only has effect if logging at debug level' ), + Option( + 'prometheus_web_user', + type='str', + default='admin', + desc='Prometheus web user' + ), + Option( + 'prometheus_web_password', + type='str', + default='admin', + desc='Prometheus web password' + ), + Option( + 'alertmanager_web_user', + type='str', + default='admin', + desc='Alertmanager web user' + ), + Option( + 'alertmanager_web_password', + type='str', + default='admin', + desc='Alertmanager web password' + ), + Option( + 'secure_monitoring_stack', + type='bool', + default=False, + desc='Enable TLS security for all the monitoring stack daemons' + ), ] def __init__(self, *args: Any, **kwargs: Any): @@ -514,6 +544,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.agent_down_multiplier = 0.0 self.agent_starting_port = 0 self.service_discovery_port = 0 + self.secure_monitoring_stack = False + self.prometheus_web_password: Optional[str] = None + self.prometheus_web_user: Optional[str] = None + self.alertmanager_web_password: Optional[str] = None + self.alertmanager_web_user: Optional[str] = None self.apply_spec_fails: List[Tuple[str, str]] = [] self.max_osd_draining_count = 10 self.device_enhanced_scan = False @@ -2462,6 +2497,14 @@ Then run the following: spec: Optional[ServiceSpec], daemon_type: str, daemon_id: str) -> List[str]: + + def get_daemon_names(daemons: List[str]) -> List[str]: + daemon_names = [] + for daemon_type in daemons: + for dd in self.cache.get_daemons_by_type(daemon_type): + daemon_names.append(dd.name()) + return daemon_names + deps = [] if daemon_type == 'haproxy': # because cephadm creates new daemon instances whenever @@ -2514,15 +2557,28 @@ Then run the following: deps.append('ingress') # add dependency on ceph-exporter daemons deps += [d.name() for d in self.cache.get_daemons_by_service('ceph-exporter')] + if self.secure_monitoring_stack: + if self.prometheus_web_user and self.prometheus_web_password: + deps.append(f'{hash(self.prometheus_web_user + self.prometheus_web_password)}') + if self.alertmanager_web_user and self.alertmanager_web_password: + deps.append(f'{hash(self.alertmanager_web_user + self.alertmanager_web_password)}') + elif daemon_type == 'grafana': + deps += get_daemon_names(['prometheus', 'loki']) + if self.secure_monitoring_stack and self.prometheus_web_user and self.prometheus_web_password: + deps.append(f'{hash(self.prometheus_web_user + self.prometheus_web_password)}') + elif daemon_type == 'alertmanager': + deps += get_daemon_names(['mgr', 'alertmanager', 'snmp-gateway']) + if self.secure_monitoring_stack and self.alertmanager_web_user and self.alertmanager_web_password: + deps.append(f'{hash(self.alertmanager_web_user + self.alertmanager_web_password)}') + elif daemon_type == 'promtail': + deps += get_daemon_names(['loki']) else: - need = { - 'grafana': ['prometheus', 'loki'], - 'alertmanager': ['mgr', 'alertmanager', 'snmp-gateway'], - 'promtail': ['loki'], - } - for dep_type in need.get(daemon_type, []): - for dd in self.cache.get_daemons_by_type(dep_type): - deps.append(dd.name()) + # TODO(redo): some error message! + pass + + if daemon_type in ['prometheus', 'node-exporter', 'alertmanager', 'grafana']: + deps.append(f'secure_monitoring_stack:{self.secure_monitoring_stack}') + return sorted(deps) @forall_hosts @@ -2613,6 +2669,18 @@ Then run the following: self.events.from_orch_error(e) raise + @handle_orch_error + def get_prometheus_access_info(self) -> Dict[str, str]: + return {'user': self.prometheus_web_user or '', + 'password': self.prometheus_web_password or '', + 'certificate': self.http_server.service_discovery.ssl_certs.get_root_cert()} + + @handle_orch_error + def get_alertmanager_access_info(self) -> Dict[str, str]: + return {'user': self.alertmanager_web_user or '', + 'password': self.alertmanager_web_password or '', + 'certificate': self.http_server.service_discovery.ssl_certs.get_root_cert()} + @handle_orch_error def apply_mon(self, spec: ServiceSpec) -> str: return self._apply(spec) diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 0d92f014bba..b5ae1677c5d 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -971,11 +971,15 @@ class CephadmServe: dd.name())) action = 'reconfig' elif last_deps != deps: - self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps, - deps)) - self.log.info('Reconfiguring %s (dependencies changed)...' % ( - dd.name())) + self.log.debug(f'{dd.name()} deps {last_deps} -> {deps}') + self.log.info(f'Reconfiguring {dd.name()} (dependencies changed)...') action = 'reconfig' + # we need only redeploy if secure_monitoring_stack value has changed: + if dd.daemon_type in ['prometheus', 'node-exporter', 'alertmanager']: + diff = list(set(last_deps) - set(deps)) + if any('secure_monitoring_stack' in e for e in diff): + action = 'redeploy' + elif spec is not None and hasattr(spec, 'extra_container_args') and dd.extra_container_args != spec.extra_container_args: self.log.debug( f'{dd.name()} container cli args {dd.extra_container_args} -> {spec.extra_container_args}') diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index 83d0f52a073..ddc0574e2b1 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -7,11 +7,14 @@ except ImportError: pass import logging +import socket + import orchestrator # noqa from mgr_module import ServiceInfoT from mgr_util import build_url -from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple +from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional from cephadm.services.monitoring import AlertmanagerService, NodeExporterService, PrometheusService +import secrets from cephadm.services.ingress import IngressSpec from cephadm.ssl_cert_utils import SSLCerts @@ -47,8 +50,13 @@ class ServiceDiscovery: def __init__(self, mgr: "CephadmOrchestrator") -> None: self.mgr = mgr self.ssl_certs = SSLCerts() + self.username: Optional[str] = None + self.password: Optional[str] = None + + def validate_password(self, realm: str, username: str, password: str) -> bool: + return (password == self.password and username == self.username) - def configure_routes(self, server: Server) -> None: + def configure_routes(self, server: Server, enable_auth: bool) -> None: ROUTES = [ Route('index', '/', server.index), Route('sd-config', '/prometheus/sd-config', server.get_sd_config), @@ -57,9 +65,28 @@ class ServiceDiscovery: d = cherrypy.dispatch.RoutesDispatcher() for route in ROUTES: d.connect(**route._asdict()) - conf = {'/': {'request.dispatch': d}} + if enable_auth: + conf = { + '/': { + 'request.dispatch': d, + 'tools.auth_basic.on': True, + 'tools.auth_basic.realm': 'localhost', + 'tools.auth_basic.checkpassword': self.validate_password + } + } + else: + conf = {'/': {'request.dispatch': d}} cherrypy.tree.mount(None, '/sd', config=conf) + def enable_auth(self) -> None: + self.username = self.mgr.get_store('service_discovery/root/username') + self.password = self.mgr.get_store('service_discovery/root/password') + if not self.password or not self.username: + self.username = 'admin' # TODO(redo): what should be the default username + self.password = secrets.token_urlsafe(20) + self.mgr.set_store('service_discovery/root/password', self.password) + self.mgr.set_store('service_discovery/root/username', self.username) + def configure_tls(self, server: Server) -> None: old_cert = self.mgr.get_store(self.KV_STORE_SD_ROOT_CERT) old_key = self.mgr.get_store(self.KV_STORE_SD_ROOT_KEY) @@ -69,16 +96,20 @@ class ServiceDiscovery: self.ssl_certs.generate_root_cert(self.mgr.get_mgr_ip()) self.mgr.set_store(self.KV_STORE_SD_ROOT_CERT, self.ssl_certs.get_root_cert()) self.mgr.set_store(self.KV_STORE_SD_ROOT_KEY, self.ssl_certs.get_root_key()) - - host = self.mgr.get_hostname() addr = self.mgr.get_mgr_ip() - server.ssl_certificate, server.ssl_private_key = self.ssl_certs.generate_cert_files(host, addr) + host_fqdn = socket.getfqdn(addr) + server.ssl_certificate, server.ssl_private_key = self.ssl_certs.generate_cert_files( + host_fqdn, addr) - def configure(self, port: int, addr: str) -> None: + def configure(self, port: int, addr: str, enable_security: bool) -> None: # we create a new server to enforce TLS/SSL config refresh self.root_server = Root(self.mgr, port, addr) - self.configure_tls(self.root_server) - self.configure_routes(self.root_server) + self.root_server.ssl_certificate = None + self.root_server.ssl_private_key = None + if enable_security: + self.enable_auth() + self.configure_tls(self.root_server) + self.configure_routes(self.root_server, enable_security) class Root(Server): @@ -95,7 +126,7 @@ class Root(Server): self.unsubscribe() super().stop() - def __init__(self, mgr: "CephadmOrchestrator", port: int, host: str): + def __init__(self, mgr: "CephadmOrchestrator", port: int = 0, host: str = ''): self.mgr = mgr super().__init__() self.socket_port = port @@ -142,7 +173,7 @@ class Root(Server): for server in servers: hostname = server.get('hostname', '') for service in cast(List[ServiceInfoT], server.get('services', [])): - if service['type'] != 'mgr': + if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id(): continue port = self.mgr.get_module_option_ex( 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index c709e5f5e9b..6c63ef6436a 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -12,7 +12,7 @@ from orchestrator import DaemonDescription from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, \ SNMPGatewaySpec, PrometheusSpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec -from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url, get_cert_issuer_info +from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url, get_cert_issuer_info, password_hash from ceph.deployment.utils import wrap_ipv6 logger = logging.getLogger(__name__) @@ -30,13 +30,17 @@ class GrafanaService(CephadmService): def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type deps = [] # type: List[str] + if self.mgr.secure_monitoring_stack and self.mgr.prometheus_web_user and self.mgr.prometheus_web_password: + deps.append(f'{hash(self.mgr.prometheus_web_user + self.mgr.prometheus_web_password)}') + deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') prom_services = [] # type: List[str] for dd in self.mgr.cache.get_daemons_by_service('prometheus'): assert dd.hostname is not None addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname) port = dd.ports[0] if dd.ports else 9095 - prom_services.append(build_url(scheme='http', host=addr, port=port)) + protocol = 'https' if self.mgr.secure_monitoring_stack else 'http' + prom_services.append(build_url(scheme=protocol, host=addr, port=port)) deps.append(dd.name()) @@ -50,10 +54,18 @@ class GrafanaService(CephadmService): deps.append(dd.name()) - grafana_data_sources = self.mgr.template.render( - 'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_services, 'loki_host': loki_host}) - - spec: GrafanaSpec = cast(GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name]) + root_cert = self.mgr.http_server.service_discovery.ssl_certs.get_root_cert() + oneline_root_cert = '\\n'.join([line.strip() for line in root_cert.splitlines()]) + grafana_data_sources = self.mgr.template.render('services/grafana/ceph-dashboard.yml.j2', + {'hosts': prom_services, + 'prometheus_user': self.mgr.prometheus_web_user, + 'prometheus_password': self.mgr.prometheus_web_password, + 'cephadm_root_ca': oneline_root_cert, + 'security_enabled': self.mgr.secure_monitoring_stack, + 'loki_host': loki_host}) + + spec: GrafanaSpec = cast( + GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name]) grafana_ini = self.mgr.template.render( 'services/grafana/grafana.ini.j2', { 'initial_admin_password': spec.initial_admin_password, @@ -103,7 +115,8 @@ class GrafanaService(CephadmService): # that were originally generated by cephadm or in case cert/key are empty. if not certs_present or (org == 'Ceph' and cn == 'cephadm'): logger.info('Regenerating cephadm self-signed grafana TLS certificates') - cert, pkey = create_self_signed_cert('Ceph', daemon_spec.host) + host_fqdn = socket.getfqdn(daemon_spec.host) + cert, pkey = create_self_signed_cert('Ceph', host_fqdn) self.mgr.set_store(cert_path, cert) self.mgr.set_store(key_path, pkey) if 'dashboard' in self.mgr.get('mgr_map')['modules']: @@ -126,7 +139,8 @@ class GrafanaService(CephadmService): > ceph orch daemon reconfig """ - self.mgr.set_health_warning('CEPHADM_CERT_ERROR', 'Invalid grafana certificate: ', 1, [err_msg]) + self.mgr.set_health_warning( + 'CEPHADM_CERT_ERROR', 'Invalid grafana certificate: ', 1, [err_msg]) return cert, pkey @@ -220,6 +234,7 @@ class AlertmanagerService(CephadmService): f'{p_result.scheme}://{hostname}:{p_result.port}{p_result.path}') proto = p_result.scheme port = p_result.port + # scan all mgrs to generate deps and to get standbys too. # assume that they are all on the same port as the active mgr. for dd in self.mgr.cache.get_daemons_by_service('mgr'): @@ -244,6 +259,7 @@ class AlertmanagerService(CephadmService): port=dd.ports[0], path='/alerts')) context = { + 'secure_monitoring_stack': self.mgr.secure_monitoring_stack, 'dashboard_urls': dashboard_urls, 'default_webhook_urls': default_webhook_urls, 'snmp_gateway_urls': snmp_gateway_urls, @@ -259,12 +275,37 @@ class AlertmanagerService(CephadmService): addr = self._inventory_get_fqdn(dd.hostname) peers.append(build_url(host=addr, port=port).lstrip('/')) - return { - "files": { - "alertmanager.yml": yml - }, - "peers": peers - }, sorted(deps) + deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') + + if self.mgr.secure_monitoring_stack: + if self.mgr.alertmanager_web_user and self.mgr.alertmanager_web_password: + deps.append(f'{hash(self.mgr.alertmanager_web_user + self.mgr.alertmanager_web_password)}') + node_ip = self.mgr.inventory.get_addr(daemon_spec.host) + host_fqdn = self._inventory_get_fqdn(daemon_spec.host) + cert, key = self.mgr.http_server.service_discovery.ssl_certs.generate_cert( + host_fqdn, node_ip) + context = { + 'alertmanager_web_user': self.mgr.alertmanager_web_user, + 'alertmanager_web_password': password_hash(self.mgr.alertmanager_web_password), + } + return { + "files": { + "alertmanager.yml": yml, + 'alertmanager.crt': cert, + 'alertmanager.key': key, + 'web.yml': self.mgr.template.render('services/alertmanager/web.yml.j2', context), + 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert() + }, + 'peers': peers, + 'web_config': '/etc/alertmanager/web.yml' + }, sorted(deps) + else: + return { + "files": { + "alertmanager.yml": yml + }, + "peers": peers + }, sorted(deps) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: # TODO: if there are multiple daemons, who is the active one? @@ -278,7 +319,8 @@ class AlertmanagerService(CephadmService): assert dd.hostname is not None addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname) port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT - service_url = build_url(scheme='http', host=addr, port=port) + protocol = 'https' if self.mgr.secure_monitoring_stack else 'http' + service_url = build_url(scheme=protocol, host=addr, port=port) self._set_service_url_on_dashboard( 'AlertManager', 'dashboard get-alertmanager-api-host', @@ -326,7 +368,6 @@ class PrometheusService(CephadmService): ) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type - spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec) try: @@ -343,7 +384,8 @@ class PrometheusService(CephadmService): # build service discovery end-point port = self.mgr.service_discovery_port mgr_addr = wrap_ipv6(self.mgr.get_mgr_ip()) - srv_end_point = f'https://{mgr_addr}:{port}/sd/prometheus/sd-config?' + protocol = 'https' if self.mgr.secure_monitoring_stack else 'http' + srv_end_point = f'{protocol}://{mgr_addr}:{port}/sd/prometheus/sd-config?' node_exporter_cnt = len(self.mgr.cache.get_daemons_by_service('node-exporter')) alertmgr_cnt = len(self.mgr.cache.get_daemons_by_service('alertmanager')) @@ -356,6 +398,11 @@ class PrometheusService(CephadmService): # generate the prometheus configuration context = { + 'alertmanager_web_user': self.mgr.alertmanager_web_user, + 'alertmanager_web_password': self.mgr.alertmanager_web_password, + 'secure_monitoring_stack': self.mgr.secure_monitoring_stack, + 'service_discovery_username': self.mgr.http_server.service_discovery.username, + 'service_discovery_password': self.mgr.http_server.service_discovery.password, 'mgr_prometheus_sd_url': mgr_prometheus_sd_url, 'node_exporter_sd_url': node_exporter_sd_url, 'alertmanager_sd_url': alertmanager_sd_url, @@ -363,15 +410,43 @@ class PrometheusService(CephadmService): 'ceph_exporter_sd_url': ceph_exporter_sd_url } - r: Dict[str, Any] = { - 'files': { - 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context), - 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert() - }, - 'retention_time': retention_time, - 'retention_size': retention_size + web_context = { + 'prometheus_web_user': self.mgr.prometheus_web_user, + 'prometheus_web_password': password_hash(self.mgr.prometheus_web_password), } + if self.mgr.secure_monitoring_stack: + cfg_key = 'mgr/prometheus/root/cert' + cmd = {'prefix': 'config-key get', 'key': cfg_key} + ret, mgr_prometheus_rootca, err = self.mgr.mon_command(cmd) + if ret != 0: + logger.error(f'mon command to get config-key {cfg_key} failed: {err}') + else: + node_ip = self.mgr.inventory.get_addr(daemon_spec.host) + host_fqdn = self._inventory_get_fqdn(daemon_spec.host) + cert, key = self.mgr.http_server.service_discovery.ssl_certs.generate_cert(host_fqdn, node_ip) + r: Dict[str, Any] = { + 'files': { + 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context), + 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert(), + 'mgr_prometheus_cert.pem': mgr_prometheus_rootca, + 'web.yml': self.mgr.template.render('services/prometheus/web.yml.j2', web_context), + 'prometheus.crt': cert, + 'prometheus.key': key, + }, + 'retention_time': retention_time, + 'retention_size': retention_size, + 'web_config': '/etc/prometheus/web.yml' + } + else: + r = { + 'files': { + 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context) + }, + 'retention_time': retention_time, + 'retention_size': retention_size + } + # include alerts, if present in the container if os.path.exists(self.mgr.prometheus_alerts_path): with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f: @@ -399,13 +474,18 @@ class PrometheusService(CephadmService): def calculate_deps(self) -> List[str]: deps = [] # type: List[str] - port = cast(int, self.mgr.get_module_option_ex( - 'prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT)) + port = cast(int, self.mgr.get_module_option_ex('prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT)) deps.append(str(port)) deps.append(str(self.mgr.service_discovery_port)) # add an explicit dependency on the active manager. This will force to # re-deploy prometheus if the mgr has changed (due to a fail-over i.e). deps.append(self.mgr.get_active_mgr().name()) + if self.mgr.secure_monitoring_stack: + if self.mgr.prometheus_web_user and self.mgr.prometheus_web_password: + deps.append(f'{hash(self.mgr.prometheus_web_user + self.mgr.prometheus_web_password)}') + if self.mgr.alertmanager_web_user and self.mgr.alertmanager_web_password: + deps.append(f'{hash(self.mgr.alertmanager_web_user + self.mgr.alertmanager_web_password)}') + deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') # add dependency on ceph-exporter daemons deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('ceph-exporter')] deps += [s for s in ['node-exporter', 'alertmanager'] if self.mgr.cache.get_daemons_by_service(s)] @@ -425,7 +505,8 @@ class PrometheusService(CephadmService): assert dd.hostname is not None addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname) port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT - service_url = build_url(scheme='http', host=addr, port=port) + protocol = 'https' if self.mgr.secure_monitoring_stack else 'http' + service_url = build_url(scheme=protocol, host=addr, port=port) self._set_service_url_on_dashboard( 'Prometheus', 'dashboard get-prometheus-api-host', @@ -454,7 +535,25 @@ class NodeExporterService(CephadmService): def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type - return {}, [] + deps = [f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}'] + if self.mgr.secure_monitoring_stack: + node_ip = self.mgr.inventory.get_addr(daemon_spec.host) + host_fqdn = self._inventory_get_fqdn(daemon_spec.host) + cert, key = self.mgr.http_server.service_discovery.ssl_certs.generate_cert( + host_fqdn, node_ip) + r = { + 'files': { + 'web.yml': self.mgr.template.render('services/node-exporter/web.yml.j2', {}), + 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert(), + 'node_exporter.crt': cert, + 'node_exporter.key': key, + }, + 'web_config': '/etc/node-exporter/web.yml' + } + else: + r = {} + + return r, deps def ok_to_stop(self, daemon_ids: List[str], diff --git a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 index 4e394106f05..b34a1fc17e2 100644 --- a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 @@ -6,8 +6,12 @@ global: {% if not secure %} http_config: tls_config: +{% if secure_monitoring_stack %} + ca_file: root_cert.pem +{% else %} insecure_skip_verify: true {% endif %} +{% endif %} route: receiver: 'default' diff --git a/src/pybind/mgr/cephadm/templates/services/alertmanager/web.yml.j2 b/src/pybind/mgr/cephadm/templates/services/alertmanager/web.yml.j2 new file mode 100644 index 00000000000..ef4f0b4c750 --- /dev/null +++ b/src/pybind/mgr/cephadm/templates/services/alertmanager/web.yml.j2 @@ -0,0 +1,5 @@ +tls_server_config: + cert_file: alertmanager.crt + key_file: alertmanager.key +basic_auth_users: + {{ alertmanager_web_user }}: {{ alertmanager_web_password }} diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/ceph-dashboard.yml.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/ceph-dashboard.yml.j2 index 7e5ffe5eaa8..46aea864f53 100644 --- a/src/pybind/mgr/cephadm/templates/services/grafana/ceph-dashboard.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/grafana/ceph-dashboard.yml.j2 @@ -1,13 +1,12 @@ # {{ cephadm_managed }} +apiVersion: 1 + deleteDatasources: {% for host in hosts %} - name: 'Dashboard{{ loop.index }}' orgId: 1 {% endfor %} - - name: 'Loki' - orgId: 2 - datasources: {% for host in hosts %} - name: 'Dashboard{{ loop.index }}' @@ -15,16 +14,26 @@ datasources: access: 'proxy' orgId: 1 url: '{{ host }}' - basicAuth: false + basicAuth: {{ 'true' if security_enabled else 'false' }} isDefault: {{ 'true' if loop.first else 'false' }} editable: false +{% if security_enabled %} + basicAuthUser: {{ prometheus_user }} + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: {{ prometheus_password }} + tlsCACert: "{{ cephadm_root_ca }}" +{% endif %} {% endfor %} - name: 'Loki' type: 'loki' access: 'proxy' - orgId: 2 url: '{{ loki_host }}' basicAuth: false - isDefault: true + isDefault: false editable: false diff --git a/src/pybind/mgr/cephadm/templates/services/node-exporter/web.yml.j2 b/src/pybind/mgr/cephadm/templates/services/node-exporter/web.yml.j2 new file mode 100644 index 00000000000..1c122034518 --- /dev/null +++ b/src/pybind/mgr/cephadm/templates/services/node-exporter/web.yml.j2 @@ -0,0 +1,3 @@ +tls_server_config: + cert_file: node_exporter.crt + key_file: node_exporter.key diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 65d1cc18f96..acbda6b9911 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -8,35 +8,83 @@ rule_files: {% if alertmanager_sd_url %} alerting: alertmanagers: - - scheme: http +{% if secure_monitoring_stack %} + - scheme: https + basic_auth: + username: {{ alertmanager_web_user }} + password: {{ alertmanager_web_password }} + tls_config: + ca_file: root_cert.pem http_sd_configs: - url: {{ alertmanager_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem +{% else %} + - scheme: http + http_sd_configs: + - url: {{ alertmanager_sd_url }} +{% endif %} {% endif %} scrape_configs: - job_name: 'ceph' +{% if secure_monitoring_stack %} + scheme: https + tls_config: + ca_file: mgr_prometheus_cert.pem honor_labels: true http_sd_configs: - url: {{ mgr_prometheus_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem +{% else %} + honor_labels: true + http_sd_configs: + - url: {{ mgr_prometheus_sd_url }} +{% endif %} {% if node_exporter_sd_url %} - job_name: 'node' +{% if secure_monitoring_stack %} + scheme: https + tls_config: + ca_file: root_cert.pem http_sd_configs: - url: {{ node_exporter_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem +{% else %} + http_sd_configs: + - url: {{ node_exporter_sd_url }} +{% endif %} {% endif %} {% if haproxy_sd_url %} - job_name: 'haproxy' +{% if secure_monitoring_stack %} + scheme: https + tls_config: + ca_file: root_cert.pem http_sd_configs: - url: {{ haproxy_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem +{% else %} + http_sd_configs: + - url: {{ haproxy_sd_url }} +{% endif %} {% endif %} {% if ceph_exporter_sd_url %} diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/web.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/web.yml.j2 new file mode 100644 index 00000000000..da3c3d724e8 --- /dev/null +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/web.yml.j2 @@ -0,0 +1,5 @@ +tls_server_config: + cert_file: prometheus.crt + key_file: prometheus.key +basic_auth_users: + {{ prometheus_web_user }}: {{ prometheus_web_password }} diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index 630218b549f..ff98a13885f 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -66,15 +66,18 @@ class FakeMgr: self.cache = FakeCache() self.spec_store = FakeSpecStore(self) + def get_mgr_id(self): + return 'mgr-1' + def list_servers(self): servers = [ {'hostname': 'node0', 'ceph_version': '16.2', - 'services': [{'type': 'mgr'}, {'type': 'mon'}]}, + 'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]}, {'hostname': 'node1', 'ceph_version': '16.2', - 'services': [{'type': 'mgr'}, {'type': 'mon'}]} + 'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]} ] return servers @@ -106,7 +109,7 @@ class TestServiceDiscovery: assert 'targets' in entry # check content - assert cfg[0]['targets'] == ['node0:9283', 'node1:9283'] + assert cfg[0]['targets'] == ['node0:9283'] def test_get_sd_config_node_exporter(self): mgr = FakeMgr() diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index aa94a79e6d0..eae1bd9b77b 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -395,12 +395,98 @@ class TestMonitoring: @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') - def test_prometheus_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + @patch("cephadm.services.monitoring.password_hash", lambda password: 'fake_password') + def test_alertmanager_config_security_enabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + def gen_cert(host, addr): + return ('mycert', 'mykey') + + def get_root_cert(): + return 'my_root_cert' + + with with_host(cephadm_module, 'test'): + cephadm_module.secure_monitoring_stack = True + cephadm_module.alertmanager_web_password = 'fake_password' + cephadm_module.alertmanager_web_user = 'admin' + cephadm_module.http_server.service_discovery.ssl_certs.generate_cert = MagicMock(side_effect=gen_cert) + cephadm_module.http_server.service_discovery.ssl_certs.get_root_cert = MagicMock(side_effect=get_root_cert) + with with_service(cephadm_module, AlertManagerSpec()): + + y = dedent(""" + # This file is generated by cephadm. + # See https://prometheus.io/docs/alerting/configuration/ for documentation. + + global: + resolve_timeout: 5m + http_config: + tls_config: + ca_file: root_cert.pem + + route: + receiver: 'default' + routes: + - group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'ceph-dashboard' + + receivers: + - name: 'default' + webhook_configs: + - name: 'ceph-dashboard' + webhook_configs: + - url: 'http://localhost:8080/api/prometheus_receiver' + """).lstrip() + + web_config = dedent(""" + tls_server_config: + cert_file: alertmanager.crt + key_file: alertmanager.key + basic_auth_users: + admin: fake_password""").lstrip() + + _run_cephadm.assert_called_with( + 'test', + 'alertmanager.test', + 'deploy', + [ + '--name', 'alertmanager.test', + '--meta-json', '{"service_name": "alertmanager", "ports": [9093, 9094], "ip": null, "deployed_by": [], "rank": null, "rank_generation": null, "extra_container_args": null, "extra_entrypoint_args": null}', + '--config-json', '-', '--tcp-ports', '9093 9094' + ], + stdin=json.dumps({ + "files": { + "alertmanager.yml": y, + 'alertmanager.crt': 'mycert', + 'alertmanager.key': 'mykey', + 'web.yml': web_config, + 'root_cert.pem': 'my_root_cert' + }, + 'peers': [], + 'web_config': '/etc/alertmanager/web.yml' + }), + image='') + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast') with with_host(cephadm_module, 'test'): with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \ with_service(cephadm_module, CephExporterSpec('ceph-exporter')) as _, \ + with_service(cephadm_module, s) as _, \ + with_service(cephadm_module, AlertManagerSpec('alertmanager')) as _, \ + with_service(cephadm_module, IngressSpec(service_id='ingress', + frontend_port=8089, + monitor_port=8999, + monitor_user='admin', + monitor_password='12345', + keepalived_password='12345', + virtual_ip="1.2.3.4/32", + backend_service='rgw.foo')) as _, \ with_service(cephadm_module, PrometheusSpec('prometheus')) as _: y = dedent(""" @@ -411,21 +497,149 @@ class TestMonitoring: rule_files: - /etc/prometheus/alerting/* + alerting: + alertmanagers: + - scheme: http + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=alertmanager scrape_configs: - job_name: 'ceph' honor_labels: true http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus + + - job_name: 'node' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter + + - job_name: 'haproxy' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy + + - job_name: 'ceph-exporter' + honor_labels: true + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter + tls_config: + ca_file: root_cert.pem + """).lstrip() + + _run_cephadm.assert_called_with( + 'test', + 'prometheus.test', + 'deploy', + [ + '--name', 'prometheus.test', + '--meta-json', + ('{"service_name": "prometheus", "ports": [9095], "ip": null, "deployed_by": [], "rank": null, ' + '"rank_generation": null, "extra_container_args": null, "extra_entrypoint_args": null}'), + '--config-json', '-', + '--tcp-ports', '9095' + ], + stdin=json.dumps({"files": {"prometheus.yml": y, + "/etc/prometheus/alerting/custom_alerts.yml": ""}, + 'retention_time': '15d', + 'retention_size': '0'}), + image='') + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + @patch("cephadm.services.monitoring.password_hash", lambda password: 'fake_password') + def test_prometheus_config_security_enabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast') + + def gen_cert(host, addr): + return ('mycert', 'mykey') + + with with_host(cephadm_module, 'test'): + cephadm_module.secure_monitoring_stack = True + cephadm_module.http_server.service_discovery.username = 'admin' + cephadm_module.http_server.service_discovery.password = 'fake_password' + cephadm_module.http_server.service_discovery.ssl_certs.generate_cert = MagicMock( + side_effect=gen_cert) + with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \ + with_service(cephadm_module, s) as _, \ + with_service(cephadm_module, AlertManagerSpec('alertmanager')) as _, \ + with_service(cephadm_module, IngressSpec(service_id='ingress', + frontend_port=8089, + monitor_port=8999, + monitor_user='admin', + monitor_password='12345', + keepalived_password='12345', + virtual_ip="1.2.3.4/32", + backend_service='rgw.foo')) as _, \ + with_service(cephadm_module, PrometheusSpec('prometheus')) as _: + + web_config = dedent(""" + tls_server_config: + cert_file: prometheus.crt + key_file: prometheus.key + basic_auth_users: + admin: fake_password""").lstrip() + + y = dedent(""" + # This file is generated by cephadm. + global: + scrape_interval: 10s + evaluation_interval: 10s + rule_files: + - /etc/prometheus/alerting/* + + alerting: + alertmanagers: + - scheme: https + basic_auth: + username: admin + password: admin + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=alertmanager + basic_auth: + username: admin + password: fake_password + tls_config: + ca_file: root_cert.pem + + scrape_configs: + - job_name: 'ceph' + scheme: https + tls_config: + ca_file: mgr_prometheus_cert.pem + honor_labels: true + http_sd_configs: - url: https://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus + basic_auth: + username: admin + password: fake_password tls_config: ca_file: root_cert.pem - job_name: 'node' + scheme: https + tls_config: + ca_file: root_cert.pem http_sd_configs: - url: https://[::1]:8765/sd/prometheus/sd-config?service=node-exporter + basic_auth: + username: admin + password: fake_password tls_config: ca_file: root_cert.pem + - job_name: 'haproxy' + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=haproxy + basic_auth: + username: admin + password: fake_password + tls_config: + ca_file: root_cert.pem - job_name: 'ceph-exporter' honor_labels: true @@ -442,16 +656,25 @@ class TestMonitoring: [ '--name', 'prometheus.test', '--meta-json', - ('{"service_name": "prometheus", "ports": [9095], "ip": null, "deployed_by": [], "rank": null, ' - '"rank_generation": null, "extra_container_args": null, "extra_entrypoint_args": null}'), + '{"service_name": "prometheus", "ports": [9095], "ip": null, "deployed_by": [], "rank": null, "rank_generation": null, "extra_container_args": null, "extra_entrypoint_args": null}', '--config-json', '-', '--tcp-ports', '9095' ], - stdin=json.dumps({"files": {"prometheus.yml": y, "root_cert.pem": '', - "/etc/prometheus/alerting/custom_alerts.yml": ""}, - 'retention_time': '15d', - 'retention_size': '0'}), - image='') + stdin=json.dumps({ + 'files': { + 'prometheus.yml': y, + 'root_cert.pem': '', + 'mgr_prometheus_cert.pem': '', + 'web.yml': web_config, + 'prometheus.crt': 'mycert', + 'prometheus.key': 'mykey', + "/etc/prometheus/alerting/custom_alerts.yml": "", + }, + 'retention_time': '15d', + 'retention_size': '0', + 'web_config': '/etc/prometheus/web.yml'}), + image='' + ) @patch("cephadm.serve.CephadmServe._run_cephadm") def test_loki_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -586,13 +809,12 @@ class TestMonitoring: allow_embedding = true""").lstrip(), # noqa: W291 'provisioning/datasources/ceph-dashboard.yml': dedent(""" # This file is generated by cephadm. + apiVersion: 1 + deleteDatasources: - name: 'Dashboard1' orgId: 1 - - name: 'Loki' - orgId: 2 - datasources: - name: 'Dashboard1' type: 'prometheus' @@ -606,10 +828,9 @@ class TestMonitoring: - name: 'Loki' type: 'loki' access: 'proxy' - orgId: 2 url: '' basicAuth: false - isDefault: true + isDefault: false editable: false""").lstrip(), 'certs/cert_file': dedent(f""" # generated by cephadm\n{grafana_cert}""").lstrip(), @@ -666,20 +887,18 @@ class TestMonitoring: ' allow_embedding = true', 'provisioning/datasources/ceph-dashboard.yml': "# This file is generated by cephadm.\n" + "apiVersion: 1\n\n" 'deleteDatasources:\n\n' - " - name: 'Loki'\n" - ' orgId: 2\n\n' 'datasources:\n\n' " - name: 'Loki'\n" " type: 'loki'\n" " access: 'proxy'\n" - ' orgId: 2\n' " url: ''\n" ' basicAuth: false\n' - ' isDefault: true\n' + ' isDefault: false\n' ' editable: false', 'certs/cert_file': ANY, - 'certs/cert_key': ANY}}, []) + 'certs/cert_key': ANY}}, ['secure_monitoring_stack:False']) @patch("cephadm.serve.CephadmServe._run_cephadm") def test_monitoring_ports(self, _run_cephadm, cephadm_module: CephadmOrchestrator): diff --git a/src/pybind/mgr/dashboard/controllers/prometheus.py b/src/pybind/mgr/dashboard/controllers/prometheus.py index ae4abfc1668..e183656b77c 100644 --- a/src/pybind/mgr/dashboard/controllers/prometheus.py +++ b/src/pybind/mgr/dashboard/controllers/prometheus.py @@ -1,10 +1,13 @@ # -*- coding: utf-8 -*- import json +import os +import tempfile from datetime import datetime import requests +from .. import mgr from ..exceptions import DashboardException from ..security import Scope from ..services import ceph_service @@ -29,15 +32,50 @@ class PrometheusReceiver(BaseController): class PrometheusRESTController(RESTController): def prometheus_proxy(self, method, path, params=None, payload=None): # type (str, str, dict, dict) - return self._proxy(self._get_api_url(Settings.PROMETHEUS_API_HOST), - method, path, 'Prometheus', params, payload, - verify=Settings.PROMETHEUS_API_SSL_VERIFY) + user, password, cert_file = self.get_access_info('prometheus') + verify = cert_file.name if cert_file else Settings.PROMETHEUS_API_SSL_VERIFY + response = self._proxy(self._get_api_url(Settings.PROMETHEUS_API_HOST), + method, path, 'Prometheus', params, payload, + user=user, password=password, verify=verify) + if cert_file: + cert_file.close() + os.unlink(cert_file.name) + return response def alert_proxy(self, method, path, params=None, payload=None): # type (str, str, dict, dict) - return self._proxy(self._get_api_url(Settings.ALERTMANAGER_API_HOST), - method, path, 'Alertmanager', params, payload, - verify=Settings.ALERTMANAGER_API_SSL_VERIFY) + user, password, cert_file = self.get_access_info('alertmanager') + verify = cert_file.name if cert_file else Settings.ALERTMANAGER_API_SSL_VERIFY + response = self._proxy(self._get_api_url(Settings.ALERTMANAGER_API_HOST), + method, path, 'Alertmanager', params, payload, + user=user, password=password, verify=verify) + if cert_file: + cert_file.close() + os.unlink(cert_file.name) + return response + + def get_access_info(self, module_name): + # type (str, str, str) + if module_name not in ['prometheus', 'alertmanager']: + raise DashboardException(f'Invalid module name {module_name}', component='prometheus') + user = None + password = None + cert_file = None + secure_monitoring_stack = bool(mgr.get_module_option_ex('cephadm', + 'secure_monitoring_stack', + 'false')) + if secure_monitoring_stack: + cmd = {'prefix': f'orch {module_name} access info'} + ret, out, _ = mgr.mon_command(cmd) + if ret == 0 and out is not None: + access_info = json.loads(out) + user = access_info['user'] + password = access_info['password'] + certificate = access_info['certificate'] + cert_file = tempfile.NamedTemporaryFile(delete=False) + cert_file.write(certificate.encode('utf-8')) + cert_file.flush() + return user, password, cert_file def _get_api_url(self, host): return host.rstrip('/') + '/api/v1' @@ -45,11 +83,15 @@ class PrometheusRESTController(RESTController): def balancer_status(self): return ceph_service.CephService.send_command('mon', 'balancer status') - def _proxy(self, base_url, method, path, api_name, params=None, payload=None, verify=True): + def _proxy(self, base_url, method, path, api_name, params=None, payload=None, verify=True, + user=None, password=None): # type (str, str, str, str, dict, dict, bool) try: + from requests.auth import HTTPBasicAuth + auth = HTTPBasicAuth(user, password) if user and password else None response = requests.request(method, base_url + path, params=params, - json=payload, verify=verify) + json=payload, verify=verify, + auth=auth) except Exception: raise DashboardException( "Could not reach {}'s API on {}".format(api_name, base_url), diff --git a/src/pybind/mgr/dashboard/services/access_control.py b/src/pybind/mgr/dashboard/services/access_control.py index d379d6e2146..789ccf67020 100644 --- a/src/pybind/mgr/dashboard/services/access_control.py +++ b/src/pybind/mgr/dashboard/services/access_control.py @@ -14,6 +14,7 @@ from typing import List, Optional, Sequence import bcrypt from mgr_module import CLICheckNonemptyFileInput, CLIReadCommand, CLIWriteCommand +from mgr_util import password_hash from .. import mgr from ..exceptions import PasswordPolicyException, PermissionNotValid, \ @@ -27,17 +28,6 @@ logger = logging.getLogger('access_control') DEFAULT_FILE_DESC = 'password/secret' -# password hashing algorithm -def password_hash(password, salt_password=None): - if not password: - return None - if not salt_password: - salt_password = bcrypt.gensalt() - else: - salt_password = salt_password.encode('utf8') - return bcrypt.hashpw(password.encode('utf8'), salt_password).decode('utf8') - - _P = Permission # short alias diff --git a/src/pybind/mgr/dashboard/tests/test_prometheus.py b/src/pybind/mgr/dashboard/tests/test_prometheus.py index cd2fb3e8dd3..21c4a0b10e9 100644 --- a/src/pybind/mgr/dashboard/tests/test_prometheus.py +++ b/src/pybind/mgr/dashboard/tests/test_prometheus.py @@ -26,43 +26,49 @@ class PrometheusControllerTest(ControllerTestCase): mgr.get_module_option.side_effect = settings.get cls.setup_controllers([Prometheus, PrometheusNotifications, PrometheusReceiver]) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_rules(self): with patch('requests.request') as mock_request: self._get('/api/prometheus/rules') mock_request.assert_called_with('GET', self.prometheus_host_api + '/rules', - json=None, params={}, verify=True) + json=None, params={}, verify=True, auth=None) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_list(self): with patch('requests.request') as mock_request: self._get('/api/prometheus') mock_request.assert_called_with('GET', self.alert_host_api + '/alerts', - json=None, params={}, verify=True) + json=None, params={}, verify=True, auth=None) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_get_silences(self): with patch('requests.request') as mock_request: self._get('/api/prometheus/silences') mock_request.assert_called_with('GET', self.alert_host_api + '/silences', - json=None, params={}, verify=True) + json=None, params={}, verify=True, auth=None) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_add_silence(self): with patch('requests.request') as mock_request: self._post('/api/prometheus/silence', {'id': 'new-silence'}) mock_request.assert_called_with('POST', self.alert_host_api + '/silences', params=None, json={'id': 'new-silence'}, - verify=True) + verify=True, auth=None) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_update_silence(self): with patch('requests.request') as mock_request: self._post('/api/prometheus/silence', {'id': 'update-silence'}) mock_request.assert_called_with('POST', self.alert_host_api + '/silences', params=None, json={'id': 'update-silence'}, - verify=True) + verify=True, auth=None) + @patch("dashboard.controllers.prometheus.mgr.get_module_option_ex", lambda a, b, c: False) def test_expire_silence(self): with patch('requests.request') as mock_request: self._delete('/api/prometheus/silence/0') mock_request.assert_called_with('DELETE', self.alert_host_api + '/silence/0', - json=None, params=None, verify=True) + json=None, params=None, verify=True, auth=None) def test_silences_empty_delete(self): with patch('requests.request') as mock_request: diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index 8c1e5be4416..721660ec2a0 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -3,6 +3,7 @@ import os if 'UNITTEST' in os.environ: import tests +import bcrypt import cephfs import contextlib import datetime @@ -873,3 +874,13 @@ def profile_method(skip_attribute: bool = False) -> Callable[[Callable[..., T]], return result return wrapper return outer + + +def password_hash(password: Optional[str], salt_password: Optional[str] = None) -> Optional[str]: + if not password: + return None + if not salt_password: + salt = bcrypt.gensalt() + else: + salt = salt_password.encode('utf8') + return bcrypt.hashpw(password.encode('utf8'), salt).decode('utf8') diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 2972d92aa18..40235553227 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -660,6 +660,14 @@ class Orchestrator(object): """Update prometheus cluster""" raise NotImplementedError() + def get_prometheus_access_info(self) -> OrchResult[Dict[str, str]]: + """get prometheus access information""" + raise NotImplementedError() + + def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]: + """get alertmanager access information""" + raise NotImplementedError() + def apply_node_exporter(self, spec: ServiceSpec) -> OrchResult[str]: """Update existing a Node-Exporter daemon(s)""" raise NotImplementedError() diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index cc97612ad6a..9c98736b64a 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -869,6 +869,18 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return HandleCommandResult(stdout=table.get_string()) + @_cli_write_command('orch prometheus access info') + def _get_prometheus_access_info(self) -> HandleCommandResult: + completion = self.get_prometheus_access_info() + access_info = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(access_info)) + + @_cli_write_command('orch alertmanager access info') + def _get_alertmanager_access_info(self) -> HandleCommandResult: + completion = self.get_alertmanager_access_info() + access_info = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(access_info)) + @_cli_write_command('orch apply osd') def _apply_osd(self, all_available_devices: bool = False, diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 069330c392c..b34b865e320 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -1,4 +1,5 @@ import cherrypy +import yaml from collections import defaultdict from pkg_resources import packaging # type: ignore import json @@ -8,11 +9,12 @@ import re import threading import time import enum +from collections import namedtuple + from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand from mgr_util import get_default_addr, profile_method, build_url from rbd import RBD -from collections import namedtuple -import yaml +from cephadm.ssl_cert_utils import SSLCerts from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable @@ -635,6 +637,7 @@ class Module(MgrModule): _global_instance = self self.metrics_thread = MetricCollectionThread(_global_instance) self.health_history = HealthHistory(self) + self.ssl_certs = SSLCerts() def _setup_static_metrics(self) -> Dict[str, Metric]: metrics = {} @@ -856,9 +859,9 @@ class Module(MgrModule): self.log.info('Restarting engine...') cherrypy.engine.stop() cherrypy.server.httpserver = None + server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr())) server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT)) - self.set_uri(build_url(scheme='http', host=self.get_server_addr(), port=server_port, path='/')) - cherrypy.config.update({'server.socket_port': server_port}) + self.configure(server_addr, server_port) cherrypy.engine.start() self.log.info('Engine started.') @@ -1722,6 +1725,50 @@ class Module(MgrModule): self.collect() self.get_file_sd_config() + def configure(self, server_addr: str, server_port: int) -> None: + secure_monitoring_stack = self.get_module_option_ex( + 'cephadm', 'secure_monitoring_stack', False) + if secure_monitoring_stack: + self.generate_tls_certificates(self.get_mgr_ip()) + cherrypy.config.update({ + 'server.socket_host': server_addr, + 'server.socket_port': server_port, + 'engine.autoreload.on': False, + 'server.ssl_module': 'builtin', + 'server.ssl_certificate': self.cert_file, + 'server.ssl_private_key': self.key_file, + }) + # Publish the URI that others may use to access the service we're about to start serving + self.set_uri(build_url(scheme='https', host=self.get_server_addr(), + port=server_port, path='/')) + else: + cherrypy.config.update({ + 'server.socket_host': server_addr, + 'server.socket_port': server_port, + 'engine.autoreload.on': False, + 'server.ssl_module': None, + 'server.ssl_certificate': None, + 'server.ssl_private_key': None, + }) + # Publish the URI that others may use to access the service we're about to start serving + self.set_uri(build_url(scheme='http', host=self.get_server_addr(), + port=server_port, path='/')) + + def generate_tls_certificates(self, host: str) -> None: + try: + old_cert = self.get_store('root/cert') + old_key = self.get_store('root/key') + if not old_cert or not old_key: + raise Exception('No old credentials for mgr-prometheus endpoint') + self.ssl_certs.load_root_credentials(old_cert, old_key) + except Exception: + self.ssl_certs.generate_root_cert(host) + self.set_store('root/cert', self.ssl_certs.get_root_cert()) + self.set_store('root/key', self.ssl_certs.get_root_key()) + + self.cert_file, self.key_file = self.ssl_certs.generate_cert_files( + self.get_hostname(), host) + def serve(self) -> None: class Root(object): @@ -1802,10 +1849,8 @@ class Module(MgrModule): self.STALE_CACHE_RETURN]: self.stale_cache_strategy = self.STALE_CACHE_FAIL - server_addr = cast(str, self.get_localized_module_option( - 'server_addr', get_default_addr())) - server_port = cast(int, self.get_localized_module_option( - 'server_port', DEFAULT_PORT)) + server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr())) + server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT)) self.log.info( "server_addr: %s server_port: %s" % (server_addr, server_port) @@ -1818,19 +1863,13 @@ class Module(MgrModule): else: self.log.info('Cache disabled') - cherrypy.config.update({ - 'server.socket_host': server_addr, - 'server.socket_port': server_port, - 'engine.autoreload.on': False - }) - # Publish the URI that others may use to access the service we're - # about to start serving - self.set_uri(build_url(scheme='http', host=self.get_server_addr(), port=server_port, path='/')) + self.configure(server_addr, server_port) cherrypy.tree.mount(Root(), "/") self.log.info('Starting engine...') cherrypy.engine.start() self.log.info('Engine started.') + # wait for the shutdown event self.shutdown_event.wait() self.shutdown_event.clear() diff --git a/src/pybind/mgr/requirements-required.txt b/src/pybind/mgr/requirements-required.txt index 1fad65c626f..76fef65dbe4 100644 --- a/src/pybind/mgr/requirements-required.txt +++ b/src/pybind/mgr/requirements-required.txt @@ -15,3 +15,4 @@ scipy setuptools werkzeug natsort +bcrypt -- 2.39.5