From 5dc96cb62165109338e6e13a259e5e6dc2b5a8ce Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Tue, 25 Jan 2022 04:07:45 +0530 Subject: [PATCH] mgr/cephadm: introducing orch apply daemon loki Signed-off-by: Avan Thakkar --- src/cephadm/cephadm | 55 ++++++++++++++++--- src/pybind/mgr/cephadm/module.py | 21 ++++++- src/pybind/mgr/cephadm/services/monitoring.py | 24 ++++++++ .../cephadm/templates/services/loki.yml.j2 | 26 +++++++++ .../templates/services/promtail.yml.j2 | 18 ++++++ src/pybind/mgr/orchestrator/_interface.py | 7 +++ src/pybind/mgr/orchestrator/module.py | 1 + .../ceph/deployment/service_spec.py | 8 ++- 8 files changed, 147 insertions(+), 13 deletions(-) create mode 100644 src/pybind/mgr/cephadm/templates/services/loki.yml.j2 create mode 100644 src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm index 490e506d276..13f68438fd9 100755 --- a/src/cephadm/cephadm +++ b/src/cephadm/cephadm @@ -50,8 +50,8 @@ DEFAULT_IMAGE_IS_MASTER = True DEFAULT_IMAGE_RELEASE = 'quincy' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.18.1' DEFAULT_CADVISOR_IMAGE = 'gcr.io/cadvisor/cadvisor:v0.39.3' -DEFAULT_LOKI_IMAGE = 'docker.io/r/grafana/loki:2.4.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/r/grafana/promtail:2.4.0' +DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0' +DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v0.18.1' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.20.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:6.7.4' @@ -515,7 +515,7 @@ class Monitoring(object): """ :param: daemon_type Either "prometheus", "alertmanager" or "node-exporter" """ - assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter') + assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki') cmd = daemon_type.replace('-', '_') code = -1 err = '' @@ -2365,7 +2365,7 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): metadata = Monitoring.components[daemon_type] r += metadata.get('args', list()) # set ip and port to bind to for nodeexporter,alertmanager,prometheus - if daemon_type != 'grafana': + if daemon_type != 'grafana' and daemon_type != 'loki': ip = '' port = Monitoring.port_map[daemon_type][0] if 'meta_json' in ctx and ctx.meta_json: @@ -2382,6 +2382,8 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id): r += ['--cluster.peer={}'.format(peer)] # some alertmanager, by default, look elsewhere for a config r += ['--config.file=/etc/alertmanager/alertmanager.yml'] + if daemon_type == 'loki': + r += ['--config.file=/etc/loki/loki.yml'] elif daemon_type == NFSGanesha.daemon_type: nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id) r += nfs_ganesha.get_daemon_args() @@ -2460,6 +2462,7 @@ def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid, daemon_type, daemon_id) config_dir = 'etc/loki' makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755) + makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755) # populate the config directory for the component from the config-json if 'files' in config_json: @@ -2655,8 +2658,10 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id, mounts['/var/lib/docker'] = '/var/lib/docker:ro' elif daemon_type == 'loki': mounts[os.path.join(data_dir, 'etc/loki')] = '/etc/loki:Z' + mounts[os.path.join(data_dir, 'data')] = '/loki:Z' elif daemon_type == 'promtail': mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z' + mounts[os.path.join(data_dir, 'data')] = '/promtail:Z' elif daemon_type == 'node-exporter': mounts['/proc'] = '/host/proc:ro' mounts['/sys'] = '/host/sys:ro' @@ -4750,7 +4755,7 @@ def prepare_ssh( cli(['orch', 'apply', 'crash']) if not ctx.skip_monitoring_stack: - for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']: + for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager', 'loki']: logger.info('Deploying %s service with default placement...' % t) cli(['orch', 'apply', t]) @@ -5272,6 +5277,10 @@ def extract_uid_gid_monitoring(ctx, daemon_type): uid, gid = 65534, 65534 elif daemon_type == 'grafana': uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana') + elif daemon_type == 'loki': + uid, gid = extract_uid_gid(ctx, file_path='/etc/loki') + elif daemon_type == 'promtail': + uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail') elif daemon_type == 'alertmanager': uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus']) else: @@ -5917,7 +5926,8 @@ def list_daemons(ctx, detail=True, legacy_dir=None): seen_versions[image_id] = version elif daemon_type in ['prometheus', 'alertmanager', - 'node-exporter']: + 'node-exporter', + 'loki']: version = Monitoring.get_version(ctx, container_id, daemon_type) seen_versions[image_id] = version elif daemon_type == 'haproxy': @@ -6064,6 +6074,8 @@ def command_adopt(ctx): command_adopt_ceph(ctx, daemon_type, daemon_id, fsid) elif daemon_type == 'prometheus': command_adopt_prometheus(ctx, daemon_id, fsid) + elif daemon_type == 'loki': + command_adopt_loki(ctx, daemon_id, fsid) elif daemon_type == 'grafana': command_adopt_grafana(ctx, daemon_id, fsid) elif daemon_type == 'node-exporter': @@ -6299,6 +6311,35 @@ def command_adopt_prometheus(ctx, daemon_id, fsid): update_firewalld(ctx, daemon_type) +def command_adopt_loki(ctx, daemon_id, fsid): + # type: (CephadmContext, str, str) -> None + daemon_type = 'loki' + (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + + _stop_and_disable(ctx, 'loki') + + data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id, + uid=uid, gid=gid) + + # config + config_src = '/etc/loki/loki.yml' + config_src = os.path.abspath(ctx.legacy_dir + config_src) + config_dst = os.path.join(data_dir_dst, 'etc/loki') + makedirs(config_dst, uid, gid, 0o755) + copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid) + + # data + data_src = '/var/lib/loki' + data_src = os.path.abspath(ctx.legacy_dir + data_src) + data_dst = os.path.join(data_dir_dst, 'data') + copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid) + + make_var_run(ctx, fsid, uid, gid) + c = get_container(ctx, fsid, daemon_type, daemon_id) + deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid) + update_firewalld(ctx, daemon_type) + + def command_adopt_grafana(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None @@ -8377,7 +8418,7 @@ def _get_parser(): parser_bootstrap.add_argument( '--skip-monitoring-stack', action='store_true', - help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)') + help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter, loki)') parser_bootstrap.add_argument( '--apply-spec', help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)') diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index ae428249e46..0b93721bf67 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -52,7 +52,7 @@ from .services.iscsi import IscsiService from .services.nfs import NFSService from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \ - NodeExporterService, SNMPGatewayService + NodeExporterService, SNMPGatewayService, LokiService from .schedule import HostAssignment from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \ ClientKeyringStore, ClientKeyringSpec @@ -93,6 +93,8 @@ os._exit = os_exit_noop # type: ignore DEFAULT_IMAGE = 'quay.io/ceph/ceph' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.18.1' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v0.18.1' +DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0' +DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.20.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:6.7.4' DEFAULT_HAPROXY_IMAGE = 'docker.io/library/haproxy:2.3' @@ -197,6 +199,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=DEFAULT_NODE_EXPORTER_IMAGE, desc='Prometheus container image', ), + Option( + 'container_image_loki', + default=DEFAULT_LOKI_IMAGE, + desc='Loki container image', + ), Option( 'container_image_haproxy', default=DEFAULT_HAPROXY_IMAGE, @@ -410,6 +417,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.container_image_grafana = '' self.container_image_alertmanager = '' self.container_image_node_exporter = '' + self.container_image_loki = '' self.container_image_haproxy = '' self.container_image_keepalived = '' self.container_image_snmp_gateway = '' @@ -505,7 +513,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, _service_clses: Sequence[Type[CephadmService]] = [ OSDService, NFSService, MonService, MgrService, MdsService, RgwService, RbdMirrorService, GrafanaService, AlertmanagerService, - PrometheusService, NodeExporterService, CrashService, IscsiService, + PrometheusService, NodeExporterService, LokiService, CrashService, IscsiService, IngressService, CustomContainerService, CephfsMirrorService, CephadmAgent, SNMPGatewayService ] @@ -657,7 +665,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, suffix = daemon_type not in [ 'mon', 'crash', 'prometheus', 'node-exporter', 'grafana', 'alertmanager', - 'container', 'agent', 'snmp-gateway' + 'container', 'agent', 'snmp-gateway', 'loki' ] if forcename: if len([d for d in existing if d.daemon_id == forcename]): @@ -1293,6 +1301,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, image = self.container_image_alertmanager elif daemon_type == 'node-exporter': image = self.container_image_node_exporter + elif daemon_type == 'loki': + image = self.container_image_loki elif daemon_type == 'haproxy': image = self.container_image_haproxy elif daemon_type == 'keepalived': @@ -2445,6 +2455,7 @@ Then run the following: 'alertmanager': PlacementSpec(count=1), 'prometheus': PlacementSpec(count=1), 'node-exporter': PlacementSpec(host_pattern='*'), + 'loki': PlacementSpec(count=1), 'crash': PlacementSpec(host_pattern='*'), 'container': PlacementSpec(count=1), 'snmp-gateway': PlacementSpec(count=1), @@ -2540,6 +2551,10 @@ Then run the following: def apply_prometheus(self, spec: ServiceSpec) -> str: return self._apply(spec) + @handle_orch_error + def apply_loki(self, spec: ServiceSpec) -> str: + return self._apply(spec) + @handle_orch_error def apply_node_exporter(self, spec: ServiceSpec) -> str: return self._apply(spec) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index d67d3057bb8..f88acc9f46a 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -378,6 +378,30 @@ class NodeExporterService(CephadmService): return HandleCommandResult(0, out, '') +class LokiService(CephadmService): + TYPE = 'loki' + + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: + assert self.TYPE == daemon_spec.daemon_type + daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) + return daemon_spec + + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: + assert self.TYPE == daemon_spec.daemon_type + deps: List[str] = [] + hostnames: List[str] = [] + for dd in self.mgr.cache.get_daemons_by_service('mgr'): + addr = self.mgr.inventory.get_addr(dd.hostname) + hostnames.append(addr) + + yml = self.mgr.template.render('services/loki.yml.j2') + return { + "files": { + "loki.yml": yml + } + }, sorted(deps) + + class SNMPGatewayService(CephadmService): TYPE = 'snmp-gateway' diff --git a/src/pybind/mgr/cephadm/templates/services/loki.yml.j2 b/src/pybind/mgr/cephadm/templates/services/loki.yml.j2 new file mode 100644 index 00000000000..d1849cb7460 --- /dev/null +++ b/src/pybind/mgr/cephadm/templates/services/loki.yml.j2 @@ -0,0 +1,26 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /tmp/loki + storage: + filesystem: + chunks_directory: /tmp/loki/chunks + rules_directory: /tmp/loki/rules + replication_factor: 1 + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2020-10-24 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h \ No newline at end of file diff --git a/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 new file mode 100644 index 00000000000..33e566492d3 --- /dev/null +++ b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 @@ -0,0 +1,18 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +client: + url: http:localhost:3100/loki/api/v1/push + +scrape_configs: +- job_name: docker + pipeline_stages: + - docker: {} + static_configs: + - labels: + job: docker + __path__: /var/lib/docker/containers//-json.log \ No newline at end of file diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 8fb45273631..6467e21a959 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -466,6 +466,7 @@ class Orchestrator(object): 'node-exporter': self.apply_node_exporter, 'osd': lambda dg: self.apply_drivegroups([dg]), # type: ignore 'prometheus': self.apply_prometheus, + 'loki': self.apply_loki, 'rbd-mirror': self.apply_rbd_mirror, 'rgw': self.apply_rgw, 'ingress': self.apply_ingress, @@ -639,6 +640,10 @@ class Orchestrator(object): def apply_node_exporter(self, spec: ServiceSpec) -> OrchResult[str]: """Update existing a Node-Exporter daemon(s)""" raise NotImplementedError() + + def apply_loki(self, spec: ServiceSpec) -> OrchResult[str]: + """Update existing a Loki daemon(s)""" + raise NotImplementedError() def apply_crash(self, spec: ServiceSpec) -> OrchResult[str]: """Update existing a crash daemon(s)""" @@ -720,6 +725,7 @@ def daemon_type_to_service(dtype: str) -> str: 'alertmanager': 'alertmanager', 'prometheus': 'prometheus', 'node-exporter': 'node-exporter', + 'loki': 'loki', 'crash': 'crash', 'crashcollector': 'crash', # Specific Rook Daemon 'container': 'container', @@ -744,6 +750,7 @@ def service_to_daemon_types(stype: str) -> List[str]: 'grafana': ['grafana'], 'alertmanager': ['alertmanager'], 'prometheus': ['prometheus'], + 'loki': ['loki'], 'node-exporter': ['node-exporter'], 'crash': ['crash'], 'container': ['container'], diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 2d73361fc6b..c0e9e585f91 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -56,6 +56,7 @@ class ServiceType(enum.Enum): grafana = 'grafana' node_exporter = 'node-exporter' prometheus = 'prometheus' + loki = 'loki' mds = 'mds' rgw = 'rgw' nfs = 'nfs' diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 5bf47d39c79..d409b922855 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -437,7 +437,7 @@ class ServiceSpec(object): This structure is supposed to be enough information to start the services. """ - KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi mds mgr mon nfs ' \ + KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi loki mds mgr mon nfs ' \ 'node-exporter osd prometheus rbd-mirror rgw agent ' \ 'container ingress cephfs-mirror snmp-gateway'.split() REQUIRES_SERVICE_ID = 'iscsi mds nfs rgw container ingress '.split() @@ -460,6 +460,7 @@ class ServiceSpec(object): 'grafana': GrafanaSpec, 'node-exporter': MonitoringSpec, 'prometheus': MonitoringSpec, + 'loki': MonitoringSpec, 'snmp-gateway': SNMPGatewaySpec, }.get(service_type, cls) if ret == ServiceSpec and not service_type: @@ -1054,7 +1055,7 @@ class MonitoringSpec(ServiceSpec): preview_only: bool = False, port: Optional[int] = None, ): - assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager'] + assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager', 'loki'] super(MonitoringSpec, self).__init__( service_type, service_id, @@ -1075,7 +1076,8 @@ class MonitoringSpec(ServiceSpec): return {'prometheus': 9095, 'node-exporter': 9100, 'alertmanager': 9093, - 'grafana': 3000}[self.service_type] + 'grafana': 3000, + 'loki': 3100}[self.service_type] yaml.add_representer(MonitoringSpec, ServiceSpec.yaml_representer) -- 2.39.5