Since Promtail is now deprecated, switch the centralized logging setup to Grafana Alloy.
Fixes: https://tracker.ceph.com/issues/71072
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
Centralized Logging in Ceph
~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Ceph now provides centralized logging with Loki & Promtail. Centralized Log Management (CLM) consolidates all log data and pushes it to a central repository,
+Ceph now provides centralized logging with Loki and Alloy. Centralized Log Management (CLM) consolidates all log data and pushes it to a central repository,
with an accessible and easy-to-use interface. Centralized logging is designed to make your life easier.
Some of the advantages are:
#. **Flexible retention policies**: with per-daemon logs, log rotation is usually set to a short interval (1-2 weeks) to save disk usage.
#. **Increased security & backup**: logs can contain sensitive information and expose usage patterns. Additionally, centralized logging allows for HA, etc.
-Centralized Logging in Ceph is implemented using two new services - ``loki`` & ``promtail``.
+Centralized Logging in Ceph is implemented using two services: ``loki`` and ``alloy``.
-Loki: It is basically a log aggregation system and is used to query logs. It can be configured as a datasource in Grafana.
+* Loki is a log aggregation system used to store and query logs. It can be configured as a data source in Grafana.
+* Alloy acts as an agent that gathers logs from each node and forwards them to Loki.
-Promtail: It acts as an agent that gathers logs from the system and makes them available to Loki.
-
-These two services are not deployed by default in a Ceph cluster. To enable the centralized logging you can follow the steps mentioned here :ref:`centralized-logging`.
+These two services are not deployed by default in a Ceph cluster. To enable centralized logging, follow the steps in :ref:`centralized-logging`.
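+
+For example, centralized logging can be enabled at bootstrap time or, on an
+existing cluster, by deploying the two services through the orchestrator (a
+minimal sketch; see :ref:`centralized-logging` for the complete procedure)::
+
+    # at bootstrap time
+    cephadm bootstrap --mon-ip <mon-ip> --with-centralized-logging
+
+    # or on a running cluster
+    ceph orch apply loki
+    ceph orch apply alloy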
.. _cephadm-monitoring-networks-ports:
elif daemon_type == 'promtail':
data_dir_root = ident.data_dir(ctx.data_dir)
config_dir = 'etc/promtail'
+ elif daemon_type == 'alloy':
+ data_dir_root = ident.data_dir(ctx.data_dir)
+ config_dir = 'etc/alloy'
makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
elif daemon_type == 'loki':
'Perhaps the ceph version being bootstrapped does not support it')
if ctx.with_centralized_logging:
- for t in ['loki', 'promtail']:
+ for t in ['loki', 'alloy']:
logger.info('Deploying %s service with default placement...' % t)
try:
cli(['orch', 'apply', t])
parser_bootstrap.add_argument(
'--with-centralized-logging',
action='store_true',
- help='Automatically provision centralized logging (promtail, loki)')
+ help='Automatically provision centralized logging (alloy, loki)')
parser_bootstrap.add_argument(
'--apply-spec',
help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
'alertmanager': [9093, 9094],
'loki': [3100],
'promtail': [9080],
+ 'alloy': [9080],
}
components = {
'promtail.yml',
],
},
+ 'alloy': {
+ 'image': DefaultImages.ALLOY.image_ref,
+ 'cpus': '1',
+ 'memory': '1GB',
+ 'args': [
+ 'run',
+ '/etc/alloy/config.alloy',
+ '--storage.path=/var/lib/alloy/data',
+ ],
+ 'config-json-files': ['config.alloy'],
+ },
'node-exporter': {
'image': DefaultImages.NODE_EXPORTER.image_ref,
'cpus': '1',
def get_version(ctx, container_id, daemon_type):
# type: (CephadmContext, str, str) -> str
"""
- :param: daemon_type Either "prometheus", "alertmanager", "loki", "promtail" or "node-exporter"
+ :param: daemon_type Either "prometheus", "alertmanager", "loki", "alloy" or "node-exporter"
"""
assert daemon_type in (
'prometheus',
'node-exporter',
'loki',
'promtail',
+ 'alloy',
)
cmd = daemon_type.replace('-', '_')
code = -1
uid, gid = extract_uid_gid(ctx, file_path='/etc/loki')
elif daemon_type == 'promtail':
uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail')
+ elif daemon_type == 'alloy':
+ uid, gid = extract_uid_gid(ctx, file_path='/etc/alloy')
elif daemon_type == 'alertmanager':
uid, gid = extract_uid_gid(
ctx, file_path=['/etc/alertmanager', '/etc/prometheus']
metadata = self.components[daemon_type]
r = list(metadata.get('args', []))
# set ip and port to bind to for nodeexporter,alertmanager,prometheus
- if daemon_type not in ['grafana', 'loki', 'promtail']:
+ if daemon_type not in ['grafana', 'loki', 'promtail', 'alloy']:
ip = ''
port = self.port_map[daemon_type][0]
meta = fetch_meta(ctx)
mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z'
mounts[log_dir] = '/var/log/ceph:z'
mounts[os.path.join(data_dir, 'data')] = '/promtail:Z'
+ elif daemon_type == 'alloy':
+ mounts[os.path.join(data_dir, 'etc/alloy')] = '/etc/alloy:Z'
+ mounts[log_dir] = '/var/log/ceph:z'
+ mounts[os.path.join(data_dir, 'data')] = '/var/lib/alloy/data:Z'
elif daemon_type == 'node-exporter':
mounts[
os.path.join(data_dir, 'etc/node-exporter')
# by ubuntu 18.04 kernel!)
]
args.extend(monitoring_args)
+ if self.identity.daemon_type == 'alloy':
+ args.extend(['--user=root'])
if self.identity.daemon_type == 'node-exporter':
# in order to support setting '--path.procfs=/host/proc','--path.sysfs=/host/sys',
# '--path.rootfs=/rootfs' for node-exporter we need to disable selinux separation
'node-exporter',
'loki',
'promtail',
+ 'alloy',
]:
version = Monitoring.get_version(
ctx, container_id, daemon_type
if TYPE_CHECKING:
from .module import CephadmOrchestrator
-LAST_MIGRATION = 8
+LAST_MIGRATION = 9
logger = logging.getLogger(__name__)
"cephadm migration still ongoing. Please wait, until the migration is complete.")
def migrate(self, startup: bool = False) -> None:
-
logger.info('running migrations')
if self.mgr.migration_current == 0:
if self.migrate_7_8():
self.set(8)
+ if self.mgr.migration_current == 8:
+ if self.migrate_8_9():
+ self.set(9)
+
def migrate_0_1(self) -> bool:
"""
Migration 0 -> 1
self.rgw_ssl_migration_queue = []
return True
+ def migrate_8_9(self) -> bool:
+ """
+ Replace Promtail with Alloy.
+
+        - If mgr daemons are still being upgraded, return False so the migration is retried later.
+ - Mark Promtail service unmanaged so cephadm won't redeploy it.
+ - Remove Promtail daemons to free ports.
+ - Deploy Alloy with Promtail's placement.
+ - Once Alloy is confirmed deployed, remove Promtail service spec.
+ """
+ try:
+ target_digests = getattr(self.mgr.upgrade.upgrade_state, "target_digests", [])
+ active_mgr_digests = self.mgr.get_active_mgr_digests()
+
+ if target_digests:
+ if not any(d in target_digests for d in active_mgr_digests):
+ logger.info(
+                        "Promtail -> Alloy migration: mgr daemons still upgrading; "
+                        "deferring migration until the next run."
+ )
+ return False
+
+ promtail_spec = self.mgr.spec_store.active_specs.get("promtail")
+ if not promtail_spec:
+                logger.info(
+                    "Promtail -> Alloy migration: no promtail service found, nothing to do."
+                )
+ return True
+
+ if not promtail_spec.unmanaged:
+ logger.info("Promtail -> Alloy migration: marking promtail unmanaged")
+ self.mgr.spec_store.set_unmanaged("promtail", True)
+
+ daemons = self.mgr.cache.get_daemons()
+ promtail_daemons = [d for d in daemons if d.daemon_type == "promtail"]
+ if promtail_daemons:
+ promtail_names = [d.name() for d in promtail_daemons]
+ logger.info(f"Promtail -> Alloy migration: removing daemons {promtail_names}")
+ self.mgr.remove_daemons(promtail_names)
+
+ daemons = self.mgr.cache.get_daemons()
+ if any(d.daemon_type == "promtail" for d in daemons):
+ logger.info(
+ "Promtail -> Alloy migration: promtail daemons still present, "
+ "skipping Alloy deployment until next run."
+ )
+ return False
+
+ alloy_spec = ServiceSpec(
+ service_type="alloy",
+ service_id="alloy",
+ placement=promtail_spec.placement
+ )
+
+ logger.info("Promtail -> Alloy migration: deploying Alloy service")
+ self.mgr.apply_alloy(alloy_spec)
+
+ logger.info("Promtail -> Alloy migration: removing promtail service spec")
+ self.mgr.remove_service("promtail")
+
+ logger.info("Promtail -> Alloy migration completed successfully.")
+ return True
+
+ except Exception as e:
+ logger.error(f"Promtail -> Alloy migration failed: {e}")
+ return False
+
def queue_migrate_rgw_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
"""
self.container_image_node_exporter = ''
self.container_image_loki = ''
self.container_image_promtail = ''
+ self.container_image_alloy = ''
self.container_image_haproxy = ''
self.container_image_keepalived = ''
self.container_image_snmp_gateway = ''
suffix = daemon_type not in [
'mon', 'crash', 'ceph-exporter', 'node-proxy',
'prometheus', 'node-exporter', 'grafana', 'alertmanager',
- 'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
+ 'container', 'agent', 'snmp-gateway', 'loki', 'promtail', 'alloy',
'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query', 'mgmt-gateway', 'oauth2-proxy'
]
if forcename:
'nvmeof': self.container_image_nvmeof,
'prometheus': self.container_image_prometheus,
'promtail': self.container_image_promtail,
+ 'alloy': self.container_image_alloy,
'snmp-gateway': self.container_image_snmp_gateway,
'mgmt-gateway': self.container_image_nginx,
'oauth2-proxy': self.container_image_oauth2_proxy,
'ceph-exporter': PlacementSpec(host_pattern='*'),
'loki': PlacementSpec(count=1),
'promtail': PlacementSpec(host_pattern='*'),
+ 'alloy': PlacementSpec(host_pattern='*'),
'crash': PlacementSpec(host_pattern='*'),
'container': PlacementSpec(count=1),
'snmp-gateway': PlacementSpec(count=1),
def apply_promtail(self, spec: ServiceSpec) -> str:
return self._apply(spec)
+ @handle_orch_error
+ def apply_alloy(self, spec: ServiceSpec) -> str:
+ return self._apply(spec)
+
@handle_orch_error
def apply_node_exporter(self, spec: ServiceSpec) -> str:
return self._apply(spec)
}, sorted(deps)
+@register_cephadm_service
+class AlloyService(CephadmService):
+ TYPE = 'alloy'
+ DEFAULT_SERVICE_PORT = 9080
+
+ @classmethod
+ def get_dependencies(cls, mgr: "CephadmOrchestrator",
+ spec: Optional[ServiceSpec] = None,
+ daemon_type: Optional[str] = None) -> List[str]:
+        return sorted(d.name() for d in mgr.cache.get_daemons_by_types(['loki']))
+
+ def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+ assert self.TYPE == daemon_spec.daemon_type
+ daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+ return daemon_spec
+
+ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+ assert self.TYPE == daemon_spec.daemon_type
+ daemons = self.mgr.cache.get_daemons_by_service('loki')
+ loki_host = ''
+ for i, dd in enumerate(daemons):
+ assert dd.hostname is not None
+ if i == 0:
+ loki_host = dd.ip if dd.ip else self.mgr.get_fqdn(dd.hostname)
+
+ context = {
+ 'client_hostname': loki_host,
+ }
+
+ alloy_config = self.mgr.template.render('services/alloy.j2', context)
+ return {
+ "files": {
+ "config.alloy": alloy_config
+ }
+ }, self.get_dependencies(self.mgr)
+
+
@register_cephadm_service
class PromtailService(CephadmService):
TYPE = 'promtail'
--- /dev/null
+// Discover the Ceph daemon log files present on this node.
+local.file_match "system" {
+  path_targets = [{
+    __address__ = "localhost",
+    __path__ = "/var/log/ceph/**/*.log",
+    job = "Cluster Logs",
+  }]
+}
+
+// Tail the matched files and forward each entry to the Loki writer below.
+// legacy_positions_file points at a Promtail-format positions file so that,
+// if one exists, tailing resumes from the last recorded offsets.
+loki.source.file "system" {
+  targets = local.file_match.system.targets
+  forward_to = [loki.write.default.receiver]
+  legacy_positions_file = "/var/lib/alloy/data/positions.yaml"
+}
+
+// Push the collected log entries to the Loki endpoint rendered by cephadm.
+loki.write "default" {
+  endpoint {
+    url = "http://{{ client_hostname }}:3100/loki/api/v1/push"
+  }
+  external_labels = {}
+}
# services based on CephadmService shouldn't have get_auth_entity
with pytest.raises(AttributeError):
- for daemon_type in ['grafana', 'alertmanager', 'prometheus', 'node-exporter', 'loki', 'promtail']:
+ for daemon_type in ['grafana', 'alertmanager', 'prometheus', 'node-exporter', 'loki', 'promtail', 'alloy']:
service_registry.get_service(daemon_type).get_auth_entity("id1", "host")
service_registry.get_service(daemon_type).get_auth_entity("id1", "")
service_registry.get_service(daemon_type).get_auth_entity("id1")
'rbd-mirror', 'cephfs-mirror', 'ceph-exporter']
GATEWAY_TYPES = ['iscsi', 'nfs', 'nvmeof', 'smb']
MONITORING_STACK_TYPES = ['node-exporter', 'prometheus',
- 'alertmanager', 'grafana', 'loki', 'promtail']
+ 'alertmanager', 'grafana', 'loki', 'promtail', 'alloy']
RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs']
CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES
i18n>Daemon Logs</a>
<ng-template ngbNavContent>
<ng-container *ngIf="showDaemonLogs && lokiServiceStatus$ | async as lokiServiceStatus ; else daemonLogsTpl ">
- <div *ngIf="promtailServiceStatus$ | async as promtailServiceStatus; else daemonLogsTpl">
+ <div *ngIf="alloyServiceStatus$ | async as alloyServiceStatus; else daemonLogsTpl">
<cd-grafana i18n-title
title="Daemon logs"
[grafanaPath]="'explore?'"
<ng-template #daemonLogsTpl>
<cd-alert-panel type="info"
- title="Loki/Promtail service not running"
+ title="Loki/Alloy service not running"
i18n-title>
- <ng-container i18n>Please start the loki and promtail service to see these logs.</ng-container>
+ <ng-container i18n>Please start the loki and alloy services to see these logs.</ng-container>
</cd-alert-panel>
</ng-template>
clogText: string;
auditLogText: string;
lokiServiceStatus$: Observable<boolean>;
- promtailServiceStatus$: Observable<boolean>;
+ alloyServiceStatus$: Observable<boolean>;
interval: number;
priorities: Array<{ name: string; value: string }> = [
return data.length > 0 && data[0].status === 1;
})
);
- this.promtailServiceStatus$ = this.cephService.getDaemons('promtail').pipe(
+ this.alloyServiceStatus$ = this.cephService.getDaemons('alloy').pipe(
map((data: any) => {
return data.length > 0 && data[0].status === 1;
})
// Remove service types:
// osd - This is deployed a different way.
// container - This should only be used in the CLI.
- this.hiddenServices.push('osd', 'container');
+ // promtail - This is deprecated and replaced by alloy.
+ this.hiddenServices.push('osd', 'container', 'promtail');
this.serviceTypes = _.difference(resp, this.hiddenServices).sort();
});
'prometheus': self.apply_prometheus,
'loki': self.apply_loki,
'promtail': self.apply_promtail,
+ 'alloy': self.apply_alloy,
'rbd-mirror': self.apply_rbd_mirror,
'rgw': self.apply_rgw,
'ingress': self.apply_ingress,
"""Update existing a Promtail daemon(s)"""
raise NotImplementedError()
+ def apply_alloy(self, spec: ServiceSpec) -> OrchResult[str]:
+ """Update existing a alloy daemon(s)"""
+ raise NotImplementedError()
+
def apply_crash(self, spec: ServiceSpec) -> OrchResult[str]:
"""Update existing a crash daemon(s)"""
raise NotImplementedError()
'ceph-exporter': 'ceph-exporter',
'loki': 'loki',
'promtail': 'promtail',
+ 'alloy': 'alloy',
'crash': 'crash',
'crashcollector': 'crash', # Specific Rook Daemon
'container': 'container',
'prometheus': ['prometheus'],
'loki': ['loki'],
'promtail': ['promtail'],
+ 'alloy': ['alloy'],
'node-exporter': ['node-exporter'],
'ceph-exporter': ['ceph-exporter'],
'crash': ['crash'],
prometheus = 'prometheus'
loki = 'loki'
promtail = 'promtail'
+ alloy = 'alloy'
mds = 'mds'
rgw = 'rgw'
nfs = 'nfs'
PROMETHEUS = _create_image('quay.io/prometheus/prometheus:v2.51.0', 'prometheus')
LOKI = _create_image('docker.io/grafana/loki:3.0.0', 'loki')
PROMTAIL = _create_image('docker.io/grafana/promtail:3.0.0', 'promtail')
+ ALLOY = _create_image('docker.io/grafana/alloy:latest', 'alloy')
NODE_EXPORTER = _create_image('quay.io/prometheus/node-exporter:v1.7.0', 'node_exporter')
ALERTMANAGER = _create_image('quay.io/prometheus/alertmanager:v0.27.0', 'alertmanager')
GRAFANA = _create_image('quay.io/ceph/grafana:11.6.0', 'grafana')
class NonCephImageServiceTypes(Enum):
prometheus = 'prometheus'
loki = 'loki'
- promtail = 'promtail'
+ alloy = 'alloy'
node_exporter = 'node-exporter'
alertmanager = 'alertmanager'
grafana = 'grafana'
+ promtail = 'promtail'
nvmeof = 'nvmeof'
snmp_gateway = 'snmp-gateway'
elasticsearch = 'elasticsearch'
'osd',
'prometheus',
'promtail',
+ 'alloy',
'rbd-mirror',
'rgw',
'smb',
'prometheus': PrometheusSpec,
'loki': MonitoringSpec,
'promtail': MonitoringSpec,
+ 'alloy': MonitoringSpec,
'snmp-gateway': SNMPGatewaySpec,
'elasticsearch': TracingSpec,
'jaeger-agent': TracingSpec,
custom_configs: Optional[List[CustomConfig]] = None,
):
assert service_type in ['grafana', 'node-exporter', 'prometheus', 'alertmanager',
- 'loki', 'promtail']
+ 'loki', 'alloy', 'promtail']
super(MonitoringSpec, self).__init__(
service_type, service_id,
'alertmanager': 9093,
'grafana': 3000,
'loki': 3100,
+ 'alloy': 9080,
'promtail': 9080}[self.service_type]