From: Avan Thakkar
Date: Mon, 24 Jan 2022 22:37:45 +0000 (+0530)
Subject: mgr/cephadm: introducing orch apply daemon loki
X-Git-Tag: v17.2.1~73^2~18
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=dbe8d0716c5602748a09f5c6b5257b91982ab10d;p=ceph.git

mgr/cephadm: introducing orch apply daemon loki

Signed-off-by: Avan Thakkar
(cherry picked from commit 5dc96cb62165109338e6e13a259e5e6dc2b5a8ce)

Conflicts:
	src/cephadm/cephadm
	src/pybind/mgr/cephadm/module.py
---

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index d87cb1d0026a..0ea604c35dd6 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -50,8 +50,8 @@ DEFAULT_IMAGE_IS_MASTER = False
 DEFAULT_IMAGE_RELEASE = 'quincy'
 DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.33.4'
 DEFAULT_CADVISOR_IMAGE = 'gcr.io/cadvisor/cadvisor:v0.39.3'
-DEFAULT_LOKI_IMAGE = 'docker.io/r/grafana/loki:2.4.0'
-DEFAULT_PROMTAIL_IMAGE = 'docker.io/r/grafana/promtail:2.4.0'
+DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
+DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
 DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.3.1'
 DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.23.0'
 DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:8.3.5'
@@ -612,7 +612,7 @@ class Monitoring(object):
         """
         :param: daemon_type Either "prometheus", "alertmanager" or "node-exporter"
         """
-        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter')
+        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki')
         cmd = daemon_type.replace('-', '_')
         code = -1
         err = ''
@@ -2540,7 +2540,7 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
         metadata = Monitoring.components[daemon_type]
         r += metadata.get('args', list())
         # set ip and port to bind to for nodeexporter,alertmanager,prometheus
-        if daemon_type != 'grafana':
+        if daemon_type != 'grafana' and daemon_type != 'loki':
             ip = ''
             port = Monitoring.port_map[daemon_type][0]
             if 'meta_json' in ctx and ctx.meta_json:
@@ -2557,6 +2557,8 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
                 r += ['--cluster.peer={}'.format(peer)]
             # some alertmanager, by default, look elsewhere for a config
             r += ['--config.file=/etc/alertmanager/alertmanager.yml']
+        if daemon_type == 'loki':
+            r += ['--config.file=/etc/loki/loki.yml']
     elif daemon_type == NFSGanesha.daemon_type:
         nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
         r += nfs_ganesha.get_daemon_args()
@@ -2637,6 +2639,7 @@ def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
                                          daemon_type, daemon_id)
             config_dir = 'etc/loki'
             makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
+            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
 
         # populate the config directory for the component from the config-json
         if 'files' in config_json:
@@ -2832,8 +2835,10 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
         mounts['/var/lib/docker'] = '/var/lib/docker:ro'
     elif daemon_type == 'loki':
         mounts[os.path.join(data_dir, 'etc/loki')] = '/etc/loki:Z'
+        mounts[os.path.join(data_dir, 'data')] = '/loki:Z'
     elif daemon_type == 'promtail':
         mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z'
+        mounts[os.path.join(data_dir, 'data')] = '/promtail:Z'
     elif daemon_type == 'node-exporter':
         mounts['/proc'] = '/host/proc:ro'
         mounts['/sys'] = '/host/sys:ro'
@@ -5036,7 +5041,7 @@ def prepare_ssh(
         cli(['orch', 'apply', 'crash'])
 
     if not ctx.skip_monitoring_stack:
-        for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
+        for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager', 'loki']:
             logger.info('Deploying %s service with default placement...' % t)
             cli(['orch', 'apply', t])
 
@@ -5600,6 +5605,10 @@ def extract_uid_gid_monitoring(ctx, daemon_type):
         uid, gid = 65534, 65534
     elif daemon_type == 'grafana':
         uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana')
+    elif daemon_type == 'loki':
+        uid, gid = extract_uid_gid(ctx, file_path='/etc/loki')
+    elif daemon_type == 'promtail':
+        uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail')
     elif daemon_type == 'alertmanager':
         uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus'])
     else:
@@ -6250,7 +6259,8 @@ def list_daemons(ctx, detail=True, legacy_dir=None):
                             seen_versions[image_id] = version
                         elif daemon_type in ['prometheus',
                                              'alertmanager',
-                                             'node-exporter']:
+                                             'node-exporter',
+                                             'loki']:
                             version = Monitoring.get_version(ctx, container_id, daemon_type)
                             seen_versions[image_id] = version
                         elif daemon_type == 'haproxy':
@@ -6402,6 +6412,8 @@ def command_adopt(ctx):
             command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
         elif daemon_type == 'prometheus':
             command_adopt_prometheus(ctx, daemon_id, fsid)
+        elif daemon_type == 'loki':
+            command_adopt_loki(ctx, daemon_id, fsid)
         elif daemon_type == 'grafana':
             command_adopt_grafana(ctx, daemon_id, fsid)
         elif daemon_type == 'node-exporter':
@@ -6637,6 +6649,35 @@ def command_adopt_prometheus(ctx, daemon_id, fsid):
     update_firewalld(ctx, daemon_type)
 
 
+def command_adopt_loki(ctx, daemon_id, fsid):
+    # type: (CephadmContext, str, str) -> None
+    daemon_type = 'loki'
+    (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
+
+    _stop_and_disable(ctx, 'loki')
+
+    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
+                                 uid=uid, gid=gid)
+
+    # config
+    config_src = '/etc/loki/loki.yml'
+    config_src = os.path.abspath(ctx.legacy_dir + config_src)
+    config_dst = os.path.join(data_dir_dst, 'etc/loki')
+    makedirs(config_dst, uid, gid, 0o755)
+    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
+
+    # data
+    data_src = '/var/lib/loki'
+    data_src = os.path.abspath(ctx.legacy_dir + data_src)
+    data_dst = os.path.join(data_dir_dst, 'data')
+    copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
+
+    make_var_run(ctx, fsid, uid, gid)
+    c = get_container(ctx, fsid, daemon_type, daemon_id)
+    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
+    update_firewalld(ctx, daemon_type)
+
+
 def command_adopt_grafana(ctx, daemon_id, fsid):
     # type: (CephadmContext, str, str) -> None
@@ -8736,7 +8777,7 @@ def _get_parser():
     parser_bootstrap.add_argument(
         '--skip-monitoring-stack',
         action='store_true',
-        help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
+        help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter, loki)')
     parser_bootstrap.add_argument(
         '--apply-spec',
         help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
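
A note on the extract_uid_gid_monitoring hunk above: cephadm resolves the uid/gid that should own a daemon's host-side directories by statting a path the container image is known to own. A rough standalone sketch of that mechanism (hypothetical helper, not cephadm's own code; assumes podman is on PATH and the image ships both /etc/loki and a stat binary):

    import subprocess

    def image_uid_gid(image, path='/etc/loki'):
        # type: (str, str) -> tuple
        # Run `stat -c '%u %g' <path>` inside the image; the result is what
        # the host data dir /var/lib/ceph/<fsid>/loki.<id> gets chowned to.
        out = subprocess.check_output(
            ['podman', 'run', '--rm', '--entrypoint', 'stat',
             image, '-c', '%u %g', path])
        uid, gid = (int(x) for x in out.split())
        return uid, gid

    # e.g. image_uid_gid('docker.io/grafana/loki:2.4.0')
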
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index d5777a6876da..3355a701cfc2 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -52,7 +52,7 @@ from .services.iscsi import IscsiService
 from .services.nfs import NFSService
 from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError
 from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \
-    NodeExporterService, SNMPGatewayService
+    NodeExporterService, SNMPGatewayService, LokiService
 from .schedule import HostAssignment
 from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \
     ClientKeyringStore, ClientKeyringSpec
@@ -93,6 +93,8 @@ os._exit = os_exit_noop  # type: ignore
 DEFAULT_IMAGE = 'quay.io/ceph/ceph'
 DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.33.4'
 DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.3.1'
+DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
+DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
 DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.23.0'
 DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:8.3.5'
 DEFAULT_HAPROXY_IMAGE = 'docker.io/library/haproxy:2.3'
@@ -197,6 +199,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=DEFAULT_NODE_EXPORTER_IMAGE,
             desc='Prometheus container image',
         ),
+        Option(
+            'container_image_loki',
+            default=DEFAULT_LOKI_IMAGE,
+            desc='Loki container image',
+        ),
         Option(
             'container_image_haproxy',
             default=DEFAULT_HAPROXY_IMAGE,
@@ -410,6 +417,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.container_image_grafana = ''
             self.container_image_alertmanager = ''
             self.container_image_node_exporter = ''
+            self.container_image_loki = ''
             self.container_image_haproxy = ''
             self.container_image_keepalived = ''
             self.container_image_snmp_gateway = ''
@@ -505,7 +513,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         _service_clses: Sequence[Type[CephadmService]] = [
             OSDService, NFSService, MonService, MgrService, MdsService,
             RgwService, RbdMirrorService, GrafanaService, AlertmanagerService,
-            PrometheusService, NodeExporterService, CrashService, IscsiService,
+            PrometheusService, NodeExporterService, LokiService, CrashService, IscsiService,
             IngressService, CustomContainerService, CephfsMirrorService,
             CephadmAgent, SNMPGatewayService
         ]
@@ -657,7 +665,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         suffix = daemon_type not in [
             'mon', 'crash',
             'prometheus', 'node-exporter', 'grafana', 'alertmanager',
-            'container', 'agent', 'snmp-gateway'
+            'container', 'agent', 'snmp-gateway', 'loki'
         ]
         if forcename:
             if len([d for d in existing if d.daemon_id == forcename]):
@@ -1296,6 +1304,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             image = self.container_image_alertmanager
         elif daemon_type == 'node-exporter':
             image = self.container_image_node_exporter
+        elif daemon_type == 'loki':
+            image = self.container_image_loki
         elif daemon_type == 'haproxy':
             image = self.container_image_haproxy
         elif daemon_type == 'keepalived':
@@ -2468,6 +2478,7 @@ Then run the following:
                 'alertmanager': PlacementSpec(count=1),
                 'prometheus': PlacementSpec(count=1),
                 'node-exporter': PlacementSpec(host_pattern='*'),
+                'loki': PlacementSpec(count=1),
                 'crash': PlacementSpec(host_pattern='*'),
                 'container': PlacementSpec(count=1),
                 'snmp-gateway': PlacementSpec(count=1),
@@ -2563,6 +2574,10 @@ Then run the following:
     def apply_prometheus(self, spec: ServiceSpec) -> str:
         return self._apply(spec)
 
+    @handle_orch_error
+    def apply_loki(self, spec: ServiceSpec) -> str:
+        return self._apply(spec)
+
     @handle_orch_error
     def apply_node_exporter(self, spec: ServiceSpec) -> str:
         return self._apply(spec)
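
With the hooks above, `ceph orch apply loki` flows through apply_loki() as an ordinary service spec, and the new 'loki': PlacementSpec(count=1) entry supplies the default placement. A sketch of the equivalent spec construction via python-common (assuming the ceph.deployment package is importable):

    from ceph.deployment.service_spec import ServiceSpec

    # Rough equivalent of `ceph orch apply loki` after this patch: 'loki' is
    # now a known service type, so from_json() accepts it and the orchestrator
    # routes the resulting spec to apply_loki().
    spec = ServiceSpec.from_json({
        'service_type': 'loki',
        'placement': {'count': 1},
    })
    print(spec.service_type)  # loki
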
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 239c24bab096..b2174be1d428 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -390,6 +390,30 @@ class NodeExporterService(CephadmService):
         return HandleCommandResult(0, out, '')
 
 
+class LokiService(CephadmService):
+    TYPE = 'loki'
+
+    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+        assert self.TYPE == daemon_spec.daemon_type
+        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+        return daemon_spec
+
+    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+        assert self.TYPE == daemon_spec.daemon_type
+        deps: List[str] = []
+        hostnames: List[str] = []
+        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
+            addr = self.mgr.inventory.get_addr(dd.hostname)
+            hostnames.append(addr)
+
+        yml = self.mgr.template.render('services/loki.yml.j2')
+        return {
+            "files": {
+                "loki.yml": yml
+            }
+        }, sorted(deps)
+
+
 class SNMPGatewayService(CephadmService):
     TYPE = 'snmp-gateway'
diff --git a/src/pybind/mgr/cephadm/templates/services/loki.yml.j2 b/src/pybind/mgr/cephadm/templates/services/loki.yml.j2
new file mode 100644
index 000000000000..d1849cb74602
--- /dev/null
+++ b/src/pybind/mgr/cephadm/templates/services/loki.yml.j2
@@ -0,0 +1,26 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+
+common:
+  path_prefix: /tmp/loki
+  storage:
+    filesystem:
+      chunks_directory: /tmp/loki/chunks
+      rules_directory: /tmp/loki/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
\ No newline at end of file
diff --git a/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2 b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2
new file mode 100644
index 000000000000..33e566492d3f
--- /dev/null
+++ b/src/pybind/mgr/cephadm/templates/services/promtail.yml.j2
@@ -0,0 +1,18 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /tmp/positions.yaml
+
+client:
+  url: http://localhost:3100/loki/api/v1/push
+
+scrape_configs:
+- job_name: docker
+  pipeline_stages:
+  - docker: {}
+  static_configs:
+  - labels:
+      job: docker
+      __path__: /var/lib/docker/containers/*/*-json.log
\ No newline at end of file
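
generate_config() above gathers the mgr addresses but does not yet feed them to the template, which takes no variables in this commit, so the rendered loki.yml can be sanity-checked outside the mgr. A minimal sketch, assuming jinja2 and PyYAML are installed and the working directory is a ceph checkout:

    import yaml
    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('src/pybind/mgr/cephadm/templates'))
    rendered = env.get_template('services/loki.yml.j2').render()
    cfg = yaml.safe_load(rendered)
    # Spot-check the values the rest of the patch depends on.
    assert cfg['server']['http_listen_port'] == 3100
    assert cfg['schema_config']['configs'][0]['store'] == 'boltdb-shipper'
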
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 912040f9e16c..fb7e87f6704d 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -466,6 +466,7 @@ class Orchestrator(object):
             'node-exporter': self.apply_node_exporter,
             'osd': lambda dg: self.apply_drivegroups([dg]),  # type: ignore
             'prometheus': self.apply_prometheus,
+            'loki': self.apply_loki,
             'rbd-mirror': self.apply_rbd_mirror,
             'rgw': self.apply_rgw,
             'ingress': self.apply_ingress,
@@ -641,6 +642,10 @@ class Orchestrator(object):
     def apply_node_exporter(self, spec: ServiceSpec) -> OrchResult[str]:
         """Update existing a Node-Exporter daemon(s)"""
         raise NotImplementedError()
+
+    def apply_loki(self, spec: ServiceSpec) -> OrchResult[str]:
+        """Update existing a Loki daemon(s)"""
+        raise NotImplementedError()
 
     def apply_crash(self, spec: ServiceSpec) -> OrchResult[str]:
         """Update existing a crash daemon(s)"""
@@ -722,6 +727,7 @@ def daemon_type_to_service(dtype: str) -> str:
         'alertmanager': 'alertmanager',
         'prometheus': 'prometheus',
         'node-exporter': 'node-exporter',
+        'loki': 'loki',
         'crash': 'crash',
         'crashcollector': 'crash',  # Specific Rook Daemon
         'container': 'container',
@@ -746,6 +752,7 @@ def service_to_daemon_types(stype: str) -> List[str]:
         'grafana': ['grafana'],
         'alertmanager': ['alertmanager'],
         'prometheus': ['prometheus'],
+        'loki': ['loki'],
         'node-exporter': ['node-exporter'],
         'crash': ['crash'],
         'container': ['container'],
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index c7631df74929..ef7ab0345074 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -57,6 +57,7 @@ class ServiceType(enum.Enum):
     grafana = 'grafana'
     node_exporter = 'node-exporter'
     prometheus = 'prometheus'
+    loki = 'loki'
     mds = 'mds'
     rgw = 'rgw'
     nfs = 'nfs'
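
The two mapping tables above have to stay mutually consistent: every daemon type must appear in the daemon list of the service it maps to. A small invariant check one could run against these functions (sketch; assumes src/pybind/mgr is on sys.path so the in-tree orchestrator package imports):

    from orchestrator import daemon_type_to_service, service_to_daemon_types

    # After this patch 'loki' participates in the same round trip as the
    # other monitoring daemons.
    for dtype in ['prometheus', 'alertmanager', 'node-exporter', 'loki']:
        stype = daemon_type_to_service(dtype)
        assert dtype in service_to_daemon_types(stype), (dtype, stype)
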
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 854b1ae4acf3..544b9ac93f00 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -443,7 +443,7 @@ class ServiceSpec(object):
     This structure is supposed to be enough information to
     start the services.
     """
-    KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi mds mgr mon nfs ' \
+    KNOWN_SERVICE_TYPES = 'alertmanager crash grafana iscsi loki mds mgr mon nfs ' \
                           'node-exporter osd prometheus rbd-mirror rgw agent ' \
                           'container ingress cephfs-mirror snmp-gateway'.split()
     REQUIRES_SERVICE_ID = 'iscsi mds nfs rgw container ingress '.split()
@@ -467,6 +467,7 @@ class ServiceSpec(object):
             'grafana': GrafanaSpec,
             'node-exporter': MonitoringSpec,
             'prometheus': MonitoringSpec,
+            'loki': MonitoringSpec,
             'snmp-gateway': SNMPGatewaySpec,
         }.get(service_type, cls)
         if ret == ServiceSpec and not service_type:
@@ -1069,7 +1070,7 @@ class MonitoringSpec(ServiceSpec):
                  port: Optional[int] = None,
                  extra_container_args: Optional[List[str]] = None,
                  ):
-        assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager']
+        assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager', 'loki']
         super(MonitoringSpec, self).__init__(
             service_type, service_id,
@@ -1090,7 +1091,8 @@ class MonitoringSpec(ServiceSpec):
             return {'prometheus': 9095,
                     'node-exporter': 9100,
                     'alertmanager': 9093,
-                    'grafana': 3000}[self.service_type]
+                    'grafana': 3000,
+                    'loki': 3100}[self.service_type]
 
 
 yaml.add_representer(MonitoringSpec, ServiceSpec.yaml_representer)
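
The service_spec.py changes make 'loki' a first-class MonitoringSpec type: it passes the widened constructor assert, requires no service id, and picks up a default port of 3100 from the map above. A short sketch (assuming the ceph.deployment package is importable; upstream the port map sits in MonitoringSpec.get_port, an assumption worth verifying against the tree):

    from ceph.deployment.service_spec import MonitoringSpec

    spec = MonitoringSpec(service_type='loki')   # passes the widened assert
    # Falls back to the service-type port map when no explicit port is set.
    assert spec.get_port() == 3100
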