From: Avan Thakkar Date: Tue, 25 Jan 2022 18:56:20 +0000 (+0530) Subject: mgr/cephadm: introducing orch apply daemon promtail X-Git-Tag: v18.0.0~1249^2~6 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b576d95f8cf288cb9ca242375a99b6bc460ecb8b;p=ceph.git mgr/cephadm: introducing orch apply daemon promtail Signed-off-by: Avan Thakkar Co-authored-by: Aashish Sharma --- diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm index 13f68438fd9b..58c8f225abb4 100755 --- a/src/cephadm/cephadm +++ b/src/cephadm/cephadm @@ -49,7 +49,6 @@ DEFAULT_IMAGE = 'quay.ceph.io/ceph-ci/ceph:master' DEFAULT_IMAGE_IS_MASTER = True DEFAULT_IMAGE_RELEASE = 'quincy' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.18.1' -DEFAULT_CADVISOR_IMAGE = 'gcr.io/cadvisor/cadvisor:v0.39.3' DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0' DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v0.18.1' @@ -429,8 +428,8 @@ class Monitoring(object): 'node-exporter': [9100], 'grafana': [3000], 'alertmanager': [9093, 9094], - 'cAdvisor': [8080], - 'loki': [3100] + 'loki': [3100], + 'promtail': [9080] } components = { @@ -446,11 +445,6 @@ class Monitoring(object): 'prometheus.yml', ], }, - 'cAdvisor': { - 'image': DEFAULT_CADVISOR_IMAGE, - 'cpus': '2', - 'memory': '1GB' - }, 'loki': { 'image': DEFAULT_LOKI_IMAGE, 'cpus': '1', @@ -515,7 +509,7 @@ class Monitoring(object): """ :param: daemon_type Either "prometheus", "alertmanager" or "node-exporter" """ - assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki') + assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki', 'promtail') cmd = daemon_type.replace('-', '_') code = -1 err = '' @@ -2365,7 +2359,7 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): metadata = Monitoring.components[daemon_type] r += metadata.get('args', list()) # set ip and port to bind to for nodeexporter,alertmanager,prometheus - if daemon_type != 'grafana' and daemon_type != 'loki': + if daemon_type != 'grafana' and daemon_type != 'loki' and daemon_type != 'promtail': ip = '' port = Monitoring.port_map[daemon_type][0] if 'meta_json' in ctx and ctx.meta_json: @@ -2384,6 +2378,8 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): r += ['--config.file=/etc/alertmanager/alertmanager.yml'] if daemon_type == 'loki': r += ['--config.file=/etc/loki/loki.yml'] + if daemon_type == 'promtail': + r += ['--config.file=/etc/promtail/promtail.yml'] elif daemon_type == NFSGanesha.daemon_type: nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id) r += nfs_ganesha.get_daemon_args() @@ -2457,6 +2453,7 @@ def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid, daemon_type, daemon_id) config_dir = 'etc/promtail' makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755) + makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755) elif daemon_type == 'loki': data_dir_root = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id) @@ -2648,19 +2645,16 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id, if daemon_type in Monitoring.components and daemon_id: data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id) + log_dir = get_log_dir(fsid, ctx.log_dir) if daemon_type == 'prometheus': mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z' mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z' - elif daemon_type == 'cAdvisor': - mounts['/sys'] = '/sys:ro' - mounts['/'] = '/rootfs:ro' - mounts['/var/run'] = '/var/run:rw' - mounts['/var/lib/docker'] = '/var/lib/docker:ro' elif daemon_type == 'loki': mounts[os.path.join(data_dir, 'etc/loki')] = '/etc/loki:Z' mounts[os.path.join(data_dir, 'data')] = '/loki:Z' elif daemon_type == 'promtail': mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z' + mounts[log_dir] = '/var/log/ceph:z' mounts[os.path.join(data_dir, 'data')] = '/promtail:Z' elif daemon_type == 'node-exporter': mounts['/proc'] = '/host/proc:ro' @@ -4755,7 +4749,7 @@ def prepare_ssh( cli(['orch', 'apply', 'crash']) if not ctx.skip_monitoring_stack: - for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager', 'loki']: + for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager', 'loki', 'promtail']: logger.info('Deploying %s service with default placement...' % t) cli(['orch', 'apply', t]) @@ -5927,7 +5921,8 @@ def list_daemons(ctx, detail=True, legacy_dir=None): elif daemon_type in ['prometheus', 'alertmanager', 'node-exporter', - 'loki']: + 'loki', + 'promtail']: version = Monitoring.get_version(ctx, container_id, daemon_type) seen_versions[image_id] = version elif daemon_type == 'haproxy': @@ -6076,6 +6071,8 @@ def command_adopt(ctx): command_adopt_prometheus(ctx, daemon_id, fsid) elif daemon_type == 'loki': command_adopt_loki(ctx, daemon_id, fsid) + elif daemon_type == 'promtail': + command_adopt_promtail(ctx, daemon_id, fsid) elif daemon_type == 'grafana': command_adopt_grafana(ctx, daemon_id, fsid) elif daemon_type == 'node-exporter': @@ -6340,6 +6337,35 @@ def command_adopt_loki(ctx, daemon_id, fsid): update_firewalld(ctx, daemon_type) +def command_adopt_promtail(ctx, daemon_id, fsid): + # type: (CephadmContext, str, str) -> None + daemon_type = 'promtail' + (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + + _stop_and_disable(ctx, 'promtail') + + data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id, + uid=uid, gid=gid) + + # config + config_src = '/etc/promtail/promtail.yml' + config_src = os.path.abspath(ctx.legacy_dir + config_src) + config_dst = os.path.join(data_dir_dst, 'etc/promtail') + makedirs(config_dst, uid, gid, 0o755) + copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid) + + # data + data_src = '/var/lib/promtail' + data_src = os.path.abspath(ctx.legacy_dir + data_src) + data_dst = os.path.join(data_dir_dst, 'data') + copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid) + + make_var_run(ctx, fsid, uid, gid) + c = get_container(ctx, fsid, daemon_type, daemon_id) + deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid) + update_firewalld(ctx, daemon_type) + + def command_adopt_grafana(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None @@ -8418,7 +8444,7 @@ def _get_parser(): parser_bootstrap.add_argument( '--skip-monitoring-stack', action='store_true', - help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter, loki)') + help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter, loki, promtail)') parser_bootstrap.add_argument( '--apply-spec', help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)') diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 0b93721bf67d..bce2bd98b540 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -52,7 +52,7 @@ from .services.iscsi import IscsiService from .services.nfs import NFSService from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \ - NodeExporterService, SNMPGatewayService, LokiService + NodeExporterService, SNMPGatewayService, LokiService, PromtailService from .schedule import HostAssignment from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \ ClientKeyringStore, ClientKeyringSpec @@ -204,6 +204,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=DEFAULT_LOKI_IMAGE, desc='Loki container image', ), + Option( + 'container_image_promtail', + default=DEFAULT_PROMTAIL_IMAGE, + desc='Promtail container image', + ), Option( 'container_image_haproxy', default=DEFAULT_HAPROXY_IMAGE, @@ -513,7 +518,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, _service_clses: Sequence[Type[CephadmService]] = [ OSDService, NFSService, MonService, MgrService, MdsService, RgwService, RbdMirrorService, GrafanaService, AlertmanagerService, - PrometheusService, NodeExporterService, LokiService, CrashService, IscsiService, + PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService, IngressService, CustomContainerService, CephfsMirrorService, CephadmAgent, SNMPGatewayService ] @@ -665,7 +670,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, suffix = daemon_type not in [ 'mon', 'crash', 'prometheus', 'node-exporter', 'grafana', 'alertmanager', - 'container', 'agent', 'snmp-gateway', 'loki' + 'container', 'agent', 'snmp-gateway', 'loki', 'promtail' ] if forcename: if len([d for d in existing if d.daemon_id == forcename]): @@ -1303,6 +1308,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, image = self.container_image_node_exporter elif daemon_type == 'loki': image = self.container_image_loki + elif daemon_type == 'promtail': + image = self.container_image_promtail elif daemon_type == 'haproxy': image = self.container_image_haproxy elif daemon_type == 'keepalived': @@ -2456,6 +2463,7 @@ Then run the following: 'prometheus': PlacementSpec(count=1), 'node-exporter': PlacementSpec(host_pattern='*'), 'loki': PlacementSpec(count=1), + 'promtail': PlacementSpec(host_pattern='*'), 'crash': PlacementSpec(host_pattern='*'), 'container': PlacementSpec(count=1), 'snmp-gateway': PlacementSpec(count=1), @@ -2555,6 +2563,10 @@ Then run the following: def apply_loki(self, spec: ServiceSpec) -> str: return self._apply(spec) + @handle_orch_error + def apply_promtail(self, spec: ServiceSpec) -> str: + return self._apply(spec) + @handle_orch_error def apply_node_exporter(self, spec: ServiceSpec) -> str: return self._apply(spec) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index f88acc9f46a8..fa89dc27b4a8 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -402,6 +402,34 @@ class LokiService(CephadmService): }, sorted(deps) +class PromtailService(CephadmService): + TYPE = 'promtail' + + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: + assert self.TYPE == daemon_spec.daemon_type + daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) + return daemon_spec + + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: + assert self.TYPE == daemon_spec.daemon_type + deps: List[str] = [] + hostnames: List[str] = [] + for dd in self.mgr.cache.get_daemons_by_service('mgr'): + addr = self.mgr.inventory.get_addr(dd.hostname) + hostnames.append(addr) + context = { + 'hostnames': hostnames, + 'client_hostname': hostnames[0], + } + + yml = self.mgr.template.render('services/promtail.yml.j2', context) + return { + "files": { + "promtail.yml": yml + } + }, sorted(deps) + + class SNMPGatewayService(CephadmService): TYPE = 'snmp-gateway' diff --git a/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 index 33e566492d3f..651ee43e5111 100644 --- a/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 @@ -5,14 +5,16 @@ server: positions: filename: /tmp/positions.yaml -client: - url: http:localhost:3100/loki/api/v1/push +clients: + - url: http://{{ client_hostname }}:3100/loki/api/v1/push scrape_configs: -- job_name: docker - pipeline_stages: - - docker: {} +- job_name: system static_configs: - - labels: - job: docker - __path__: /var/lib/docker/containers//-json.log \ No newline at end of file + - targets: +{% for url in hostnames %} + - {{ url }} +{% endfor %} + labels: + job: Cluster Logs + __path__: /var/log/ceph/**/*.log diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 6467e21a9592..b62ef9019fac 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -467,6 +467,7 @@ class Orchestrator(object): 'osd': lambda dg: self.apply_drivegroups([dg]), # type: ignore 'prometheus': self.apply_prometheus, 'loki': self.apply_loki, + 'promtail': self.apply_promtail, 'rbd-mirror': self.apply_rbd_mirror, 'rgw': self.apply_rgw, 'ingress': self.apply_ingress, @@ -644,6 +645,10 @@ class Orchestrator(object): def apply_loki(self, spec: ServiceSpec) -> OrchResult[str]: """Update existing a Loki daemon(s)""" raise NotImplementedError() + + def apply_promtail(self, spec: ServiceSpec) -> OrchResult[str]: + """Update existing a Promtail daemon(s)""" + raise NotImplementedError() def apply_crash(self, spec: ServiceSpec) -> OrchResult[str]: """Update existing a crash daemon(s)""" @@ -726,6 +731,7 @@ def daemon_type_to_service(dtype: str) -> str: 'prometheus': 'prometheus', 'node-exporter': 'node-exporter', 'loki': 'loki', + 'promtail': 'promtail', 'crash': 'crash', 'crashcollector': 'crash', # Specific Rook Daemon 'container': 'container', @@ -751,6 +757,7 @@ def service_to_daemon_types(stype: str) -> List[str]: 'alertmanager': ['alertmanager'], 'prometheus': ['prometheus'], 'loki': ['loki'], + 'promtail': ['promtail'], 'node-exporter': ['node-exporter'], 'crash': ['crash'], 'container': ['container'], diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index c0e9e585f91a..a1a966da57e0 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -57,6 +57,7 @@ class ServiceType(enum.Enum): node_exporter = 'node-exporter' prometheus = 'prometheus' loki = 'loki' + promtail = 'promtail' mds = 'mds' rgw = 'rgw' nfs = 'nfs' diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index d409b9228552..baeb7e7ff1cf 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -437,7 +437,7 @@ class ServiceSpec(object): This structure is supposed to be enough information to start the services. """ - KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi loki mds mgr mon nfs ' \ + KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi loki promtail mds mgr mon nfs ' \ 'node-exporter osd prometheus rbd-mirror rgw agent ' \ 'container ingress cephfs-mirror snmp-gateway'.split() REQUIRES_SERVICE_ID = 'iscsi mds nfs rgw container ingress '.split() @@ -461,6 +461,7 @@ class ServiceSpec(object): 'node-exporter': MonitoringSpec, 'prometheus': MonitoringSpec, 'loki': MonitoringSpec, + 'promtail': MonitoringSpec, 'snmp-gateway': SNMPGatewaySpec, }.get(service_type, cls) if ret == ServiceSpec and not service_type: @@ -1055,7 +1056,8 @@ class MonitoringSpec(ServiceSpec): preview_only: bool = False, port: Optional[int] = None, ): - assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager', 'loki'] + assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager', + 'loki', 'promtail'] super(MonitoringSpec, self).__init__( service_type, service_id, @@ -1077,7 +1079,8 @@ class MonitoringSpec(ServiceSpec): 'node-exporter': 9100, 'alertmanager': 9093, 'grafana': 3000, - 'loki': 3100}[self.service_type] + 'loki': 3100, + 'promtail': 9080}[self.service_type] yaml.add_representer(MonitoringSpec, ServiceSpec.yaml_representer)