git-server-git.apps.pok.os.sepia.ceph.com Git - xfsprogs-dev.git/commitdiff
xfs_healer: create a per-mount background monitoring service
author: Darrick J. Wong <djwong@kernel.org>
Thu, 5 Mar 2026 21:26:27 +0000 (13:26 -0800)
committer: Darrick J. Wong <djwong@kernel.org>
Thu, 9 Apr 2026 22:30:17 +0000 (15:30 -0700)
Create a systemd service definition for our self-healing filesystem
daemon so that we can run it for every mounted filesystem.  Add a
hidden switch so that we can print the service unit name for fstests.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
healer/Makefile
healer/system-xfs_healer.slice [new file with mode: 0644]
healer/xfs_healer.c
healer/xfs_healer.h
healer/xfs_healer@.service.in [new file with mode: 0644]

index 981192b81af6266116d40f468652665e6c4af2ae..ee44aaee4612505639156b2ee5f2d695589438ee 100644 (file)
@@ -22,7 +22,23 @@ LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
 LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
 LLDFLAGS = -static
 
-default: depend $(LTCOMMAND)
+XFS_HEALER_SVCNAME=xfs_healer@.service
+CFLAGS += -DXFS_HEALER_SVCNAME=\"$(XFS_HEALER_SVCNAME)\"
+
+ifeq ($(HAVE_SYSTEMD),yes)
+INSTALL_HEALER += install-systemd
+SYSTEMD_SERVICES=\
+       system-xfs_healer.slice \
+       $(XFS_HEALER_SVCNAME)
+OPTIONAL_TARGETS += $(SYSTEMD_SERVICES)
+endif
+
+default: depend $(LTCOMMAND) $(SYSTEMD_SERVICES)
+
+%.service: %.service.in $(builddefs)
+       @echo "    [SED]    $@"
+       $(Q)$(SED) -e "s|@pkg_libexec_dir@|$(PKG_LIBEXEC_DIR)|g" \
+                  < $< > $@
 
 include $(BUILDRULES)
 
@@ -32,6 +48,10 @@ install-healer: default
        $(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
        $(INSTALL) -m 755 $(LTCOMMAND) $(PKG_LIBEXEC_DIR)
 
+install-systemd: default
+       $(INSTALL) -m 755 -d $(SYSTEMD_SYSTEM_UNIT_DIR)
+       $(INSTALL) -m 644 $(SYSTEMD_SERVICES) $(SYSTEMD_SYSTEM_UNIT_DIR)
+
 install-dev:
 
 -include .dep
diff --git a/healer/system-xfs_healer.slice b/healer/system-xfs_healer.slice
new file mode 100644 (file)
index 0000000..b8f5bca
--- /dev/null
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=xfs_healer background service slice
+Before=slices.target
+
+[Slice]
+
+# If the CPU usage cgroup controller is available, don't use more than 2 cores
+# for all background processes.  One thread to read events, another to run
+# repairs.
+CPUQuota=200%
+CPUAccounting=true
+
+[Install]
+# As of systemd 249, the systemd cgroupv2 configuration code will drop resource
+# controllers from the root and system.slice cgroups at startup if it doesn't
+# find any direct dependencies that require a given controller.  Newly
+# activated units with resource control directives are created under the system
+# slice but do not cause a reconfiguration of the slice's resource controllers.
+# Hence we cannot put CPUQuota= into the xfs_healer service units directly.
+#
+# For the CPUQuota directive to have any effect, we must therefore create an
+# explicit definition file for the slice that systemd creates to contain the
+# xfs_healer instance units (e.g. xfs_healer@.service) and we must configure
+# this slice as a dependency of the system slice to establish the direct
+# dependency relation.
+WantedBy=system.slice
index 71afa0331c8c7e29d78c12e30d1a1cf921e6e69c..d9ab53c153a456c2f237501961c85595c0d6db2a 100644 (file)
@@ -424,6 +424,7 @@ enum long_opt_nr {
        LOPT_HELP,
        LOPT_QUIET,
        LOPT_REPAIR,
+       LOPT_SVCNAME,
 
        LOPT_MAX,
 };
@@ -455,6 +456,7 @@ main(
                [LOPT_HELP]        = {"help", no_argument, NULL, 0 },
                [LOPT_QUIET]       = {"quiet", no_argument, &ctx.log, 0 },
                [LOPT_REPAIR]      = {"repair", no_argument, &ctx.want_repair, 1 },
+               [LOPT_SVCNAME]     = {"svcname", no_argument, &ctx.print_svcname, 1 },
 
                [LOPT_MAX]         = {NULL, 0, NULL, 0 },
        };
@@ -491,6 +493,20 @@ main(
 
        ctx.mntpoint = argv[optind];
 
+       if (ctx.print_svcname) {
+               char    unitname[PATH_MAX];
+
+               ret = systemd_path_instance_unit_name(XFS_HEALER_SVCNAME,
+                               ctx.mntpoint, unitname, sizeof(unitname));
+               if (ret) {
+                       perror(ctx.mntpoint);
+                       return EXIT_FAILURE;
+               }
+
+               printf("%s\n", unitname);
+               return EXIT_SUCCESS;
+       }
+
        ret = setup_monitor(&ctx);
        if (ret)
                goto out_events;
index 6d12921245934c2a5b6a0fd838975e83b5f487c0..679bdc95ae48f84f489e7b16a9667337aa407206 100644 (file)
@@ -26,6 +26,7 @@ struct healer_ctx {
        int                     everything;
        int                     foreground;
        int                     want_repair;
+       int                     print_svcname;
 
        /* fd and fs geometry for mount */
        struct xfs_fd           mnt;
diff --git a/healer/xfs_healer@.service.in b/healer/xfs_healer@.service.in
new file mode 100644 (file)
index 0000000..3852578
--- /dev/null
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Self Healing of XFS Metadata for %f
+
+# Explicitly require the capabilities that this program needs
+ConditionCapability=CAP_SYS_ADMIN
+ConditionCapability=CAP_DAC_OVERRIDE
+
+# Must be a mountpoint
+ConditionPathIsMountPoint=%f
+RequiresMountsFor=%f
+
+[Service]
+Type=exec
+Environment=SERVICE_MODE=1
+ExecStart=@pkg_libexec_dir@/xfs_healer %f
+SyslogIdentifier=%N
+
+# Create the service underneath the healer background service slice so that we
+# can control resource usage.
+Slice=system-xfs_healer.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# xfs_healer avoids pinning mounted filesystems by recording the file handle
+# for the provided mountpoint (%f) before opening the health monitor, after
+# which it closes the fd for the mountpoint.  If repairs are needed, it will
+# reopen the mountpoint, resample the file handle, and proceed only if the
+# handles match.  If the filesystem is unmounted, the daemon exits.  If the
+# mountpoint moves, repairs will not be attempted against the wrong filesystem.
+#
+# Due to this resampling behavior, xfs_healer must see the same filesystem
+# mount tree inside the service container as outside, with the same ro/rw
+# state.  BindPaths doesn't work on the paths that are made readonly by
+# ProtectSystem and ProtectHome, so it is not possible to set either option.
+# DynamicUser sets ProtectSystem, so that also cannot be used.  We cannot use
+# BindPaths to bind the desired mountpoint somewhere under /tmp like xfs_scrub
+# does because that pins the mount.
+#
+# Regrettably, this leaves xfs_healer less hardened than xfs_scrub.
+# Surprisingly, this doesn't affect xfs_healer's score dramatically.
+DynamicUser=false
+ProtectSystem=false
+ProtectHome=no
+PrivateTmp=true
+PrivateDevices=true
+
+# Don't let healer complain about paths in /etc/projects that have been hidden
+# by our sandboxing.  healer doesn't care about project ids anyway.
+InaccessiblePaths=-/etc/projects
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default Linux personality
+LockPersonality=true
+
+# No memory pages that are both writable and executable
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# xfs_healer needs these privileges to open the rootdir and monitor
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+AmbientCapabilities=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+NoNewPrivileges=true
+
+# xfs_healer doesn't create files
+UMask=7777
+
+# No access to hardware /dev files; only the standard pseudo-devices
+ProtectClock=true
+DevicePolicy=closed
+
+[Install]
+WantedBy=multi-user.target
+# If someone tries to enable the template itself, translate that into enabling
+# this service on the root directory at systemd startup time.  In the
+# initramfs, the udev rules in xfs_healer.rules run before systemd starts.
+DefaultInstance=-