xfs_healer: create daemon to listen for health events

author Darrick J. Wong <djwong@kernel.org>

Sun, 22 Feb 2026 22:41:14 +0000 (14:41 -0800)

committer Darrick J. Wong <djwong@kernel.org>

Thu, 9 Apr 2026 22:30:17 +0000 (15:30 -0700)
author Darrick J. Wong <djwong@kernel.org>
Sun, 22 Feb 2026 22:41:14 +0000 (14:41 -0800)
committer Darrick J. Wong <djwong@kernel.org>
Thu, 9 Apr 2026 22:30:17 +0000 (15:30 -0700)
diff --git a/Makefile b/Makefile

index c73aa391bc5f43a4e4fc957c456013b1db7fb798..1f499c30f3457e950a9ed8ed9663d2fee9b84091 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -69,6 +69,10 @@ ifeq ("$(ENABLE_SCRUB)","yes")
  TOOL_SUBDIRS += scrub
  endif
  
+ifeq ("$(ENABLE_HEALER)","yes")
+TOOL_SUBDIRS += healer
+endif
+
  ifneq ("$(XGETTEXT)","")
  TOOL_SUBDIRS += po
  endif
@@ -100,6 +104,7 @@ mkfs: libxcmd
  spaceman: libxcmd libhandle
  scrub: libhandle libxcmd
  rtcp: libfrog
+healer: libhandle
  
  ifeq ($(HAVE_BUILDDEFS), yes)
  include $(BUILDRULES)
diff --git a/configure.ac b/configure.ac

index cffcaf373cfa5e96914c6303955ea2f2ba13f2c5..90af1f84035ee60544d0c315260b6050d38f0702 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -110,6 +110,12 @@ AC_ARG_ENABLE(libicu,
  [  --enable-libicu=[yes/no]  Enable Unicode name scanning in xfs_scrub (libicu) [default=probe]],,
         enable_libicu=probe)
  
+# Enable xfs_healer build
+AC_ARG_ENABLE(healer,
+[  --enable-healer=[yes/no]  Enable build of xfs_healer utility [[default=yes]]],,
+       enable_healer=yes)
+AC_SUBST(enable_healer)
+
  #
  # If the user specified a libdir ending in lib64 do not append another
  # 64 to the library names.
diff --git a/healer/Makefile b/healer/Makefile

new file mode 100644 (file)

index 0000000..e82c820
--- /dev/null
+++ b/healer/Makefile
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2024-2026 Oracle.  All Rights Reserved.
+# Build the xfs_healer daemon and install it into $(PKG_LIBEXEC_DIR).
+
+TOPDIR = ..
+builddefs=$(TOPDIR)/include/builddefs
+include $(builddefs)
+
+INSTALL_HEALER = install-healer
+
+LTCOMMAND = xfs_healer
+
+CFILES = \
+xfs_healer.c
+
+HFILES = \
+xfs_healer.h
+
+LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
+LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
+LLDFLAGS = -static
+
+default: depend $(LTCOMMAND)
+
+include $(BUILDRULES)
+
+install: $(INSTALL_HEALER)
+
+install-healer: default
+       $(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
+       $(INSTALL) -m 755 $(LTCOMMAND) $(PKG_LIBEXEC_DIR)
+
+install-dev:
+
+-include .dep
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c

new file mode 100644 (file)

index 0000000..076d535
--- /dev/null
+++ b/healer/xfs_healer.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "platform_defs.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/paths.h"
+#include "libfrog/healthevent.h"
+#include "libfrog/workqueue.h"
+#include "libfrog/systemd.h"
+#include "xfs_healer.h"
+
+/* Program name; needed for libfrog error reports. */
+char                           *progname = "xfs_healer";
+
+/* Open a health monitor on mnt_fd; returns whatever the ioctl returned. */
+static int
+open_health_monitor(
+       struct healer_ctx               *ctx,
+       int                             mnt_fd)
+{
+       /* --everything asks the kernel for verbose event reporting. */
+       struct xfs_health_monitor       hmo = {
+               .format                 = XFS_HEALTH_MONITOR_FMT_V0,
+               .flags                  = ctx->everything ?
+                                               XFS_HEALTH_MONITOR_VERBOSE : 0,
+       };
+
+       return ioctl(mnt_fd, XFS_IOC_HEALTH_MONITOR, &hmo);
+}
+
+/*
+ * Decide if this event can only be reported upon, and not acted upon.
+ */
+static bool
+event_not_actionable(
+       const struct xfs_health_monitor_event   *hme)
+{
+       const bool      report_only =
+               hme->type == XFS_HEALTH_MONITOR_TYPE_LOST ||
+               hme->type == XFS_HEALTH_MONITOR_TYPE_RUNNING ||
+               hme->type == XFS_HEALTH_MONITOR_TYPE_UNMOUNT ||
+               hme->type == XFS_HEALTH_MONITOR_TYPE_SHUTDOWN;
+
+       return report_only;
+}
+
+/* Log everything unless --quiet; report-only events are logged regardless. */
+static bool
+event_loggable(
+       const struct healer_ctx                 *ctx,
+       const struct xfs_health_monitor_event   *hme)
+{
+       return ctx->log ? true : event_not_actionable(hme);
+}
+
+/*
+ * Workqueue callback: report one health event, then release it.  @arg is the
+ * malloc'd event handed over by monitor(); this function always frees it.
+ */
+static void
+handle_event(
+       struct workqueue                *wq,
+       uint32_t                        index,
+       void                            *arg)
+{
+       struct healer_ctx               *ctx = wq->wq_ctx;
+       struct xfs_health_monitor_event *event = arg;
+       struct hme_prefix               prefix;
+
+       hme_prefix_init(&prefix, ctx->mntpoint);
+
+       /* Purely informational events get logged even under --quiet. */
+       if (event_loggable(ctx, event)) {
+               /* conlock coordinates the logging output of all workers. */
+               pthread_mutex_lock(&ctx->conlock);
+               hme_report_event(&prefix, event);
+               pthread_mutex_unlock(&ctx->conlock);
+       }
+
+       free(event);
+}
+
+/*
+ * Look up the monitored mount in the mount table and save a copy of its
+ * source name in ctx->fsname.  The fs_table_ helpers are avoided because we
+ * might be running in a restricted environment without device file access.
+ * Returns 0 if ctx->fsname ended up set, -1 otherwise.
+ */
+static int
+try_capture_fsinfo(
+       struct healer_ctx       *ctx)
+{
+       char                    want[PATH_MAX];
+       char                    have[PATH_MAX];
+       struct mntent           *ent;
+       FILE                    *table;
+
+       if (realpath(ctx->mntpoint, want) == NULL)
+               return -1;
+
+       table = setmntent(_PATH_PROC_MOUNTS, "r");
+       if (!table)
+               return -1;
+
+       /* Only XFS entries whose resolved mount dir equals ours qualify. */
+       for (ent = getmntent(table); ent != NULL; ent = getmntent(table)) {
+               if (strcmp(ent->mnt_type, "xfs") != 0 ||
+                   realpath(ent->mnt_dir, have) == NULL ||
+                   strcmp(want, have) != 0)
+                       continue;
+               ctx->fsname = strdup(ent->mnt_fsname);
+               break;
+       }
+
+       endmntent(table);
+
+       return ctx->fsname != NULL ? 0 : -1;
+}
+
+/* Number of event handler threads: one per cpu with --foreground, else 1. */
+static unsigned int
+healer_nproc(
+       const struct healer_ctx *ctx)
+{
+       if (ctx->foreground)
+               return platform_nproc();
+
+       return 1;
+}
+
+/*
+ * Set ourselves up to monitor the given mountpoint for health events.
+ * Returns 0 on success; on failure, reports why on stderr and returns -1.
+ */
+static int
+setup_monitor(
+       struct healer_ctx       *ctx)
+{
+       const long              BUF_SIZE = sysconf(_SC_PAGE_SIZE) * 2;
+       int                     mon_fd;
+       int                     ret;
+
+       if (xfd_open(&ctx->mnt, ctx->mntpoint, O_RDONLY)) {
+               perror(ctx->mntpoint);
+               return -1;
+       }
+
+       if (try_capture_fsinfo(ctx)) {
+               fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+                               _("Not a XFS mount point."));
+               goto out_mnt_fd;
+       }
+
+       /*
+        * Open the health monitor, then close the mountpoint to avoid pinning
+        * it.  We can reconnect later if need be.
+        */
+       mon_fd = open_health_monitor(ctx, ctx->mnt.fd);
+       if (mon_fd < 0) {
+               switch (errno) {
+               case ENOTTY:
+               case EOPNOTSUPP:
+                       fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+                               _("XFS health monitoring not supported."));
+                       break;
+               case EEXIST:
+                       fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+                               _("XFS health monitoring already running."));
+                       break;
+               default:
+                       perror(ctx->mntpoint);
+                       break;
+               }
+               goto out_mnt_fd;
+       }
+       close(ctx->mnt.fd);
+       ctx->mnt.fd = -1;
+
+       /*
+        * mon_fp consumes mon_fd.  We intentionally leave mon_fp attached to
+        * the context so that we keep the monitoring fd open until we've torn
+        * down all the background threads.
+        */
+       ctx->mon_fp = fdopen(mon_fd, "r");
+       if (!ctx->mon_fp) {
+               perror(ctx->mntpoint);
+               goto out_mon_fd;
+       }
+       mon_fd = -1;    /* fclose() owns it now; never close() it too */
+
+       /* Increase the buffer size so that we can reduce kernel calls */
+       ctx->mon_buf = malloc(BUF_SIZE);
+       if (ctx->mon_buf)
+               setvbuf(ctx->mon_fp, ctx->mon_buf, _IOFBF, BUF_SIZE);
+
+       /*
+        * Queue up to 1MB of events before we stop trying to read events from
+        * the kernel as quickly as we can.  Note that the kernel won't accrue
+        * more than 32K of internal events before it starts dropping them.
+        */
+       ret = workqueue_create_bound(&ctx->event_queue, ctx, healer_nproc(ctx),
+                       1048576 / sizeof(struct xfs_health_monitor_event));
+       if (ret) {
+               errno = ret;
+               fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+                               _("worker threadpool setup"), strerror(errno));
+               goto out_mon_fp;
+       }
+       ctx->queue_active = true;
+
+       return 0;
+
+out_mon_fp:
+       fclose(ctx->mon_fp);
+       ctx->mon_fp = NULL;
+out_mon_fd:
+       if (mon_fd >= 0)
+               close(mon_fd);
+out_mnt_fd:
+       if (ctx->mnt.fd >= 0)
+               close(ctx->mnt.fd);
+       ctx->mnt.fd = -1;
+       return -1;
+}
+
+/*
+ * Read and queue health events until EOF, unmount, or error; 0 if clean.
+ */
+static int
+monitor(
+       struct healer_ctx       *ctx)
+{
+       bool                    mounted = true;
+       size_t                  nr;
+       int                     ret = 0;
+
+       do {
+               struct xfs_health_monitor_event *hme = malloc(sizeof(*hme));
+
+               if (!hme) {
+                       pthread_mutex_lock(&ctx->conlock);
+                       fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+                                       _("could not allocate event object"));
+                       pthread_mutex_unlock(&ctx->conlock);
+                       ret = -1;
+                       break;
+               }
+
+               nr = fread(hme, sizeof(*hme), 1, ctx->mon_fp);
+               if (ferror(ctx->mon_fp)) {
+                       ret = errno; /* save before locking clobbers it */
+                       pthread_mutex_lock(&ctx->conlock);
+                       fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+                                       _("error reading event file"),
+                                       strerror(ret));
+                       pthread_mutex_unlock(&ctx->conlock);
+                       free(hme);
+                       ret = -1;
+                       break;
+               }
+               if (nr == 0) {
+                       free(hme);
+                       break;
+               }
+               mounted = hme->type != XFS_HEALTH_MONITOR_TYPE_UNMOUNT;
+
+               /* handle_event owns hme if the workqueue_add succeeds */
+               ret = workqueue_add(&ctx->event_queue, handle_event, 0, hme);
+               if (ret) {
+                       pthread_mutex_lock(&ctx->conlock);
+                       fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+                                       _("could not queue event object"),
+                                       strerror(ret));
+                       pthread_mutex_unlock(&ctx->conlock);
+                       free(hme);
+                       break;
+               }
+       } while (nr > 0 && mounted);
+
+       return ret;
+}
+
+/* Release everything setup_monitor created; safe on a partly set up ctx. */
+static void
+teardown_monitor(
+       struct healer_ctx       *ctx)
+{
+       if (ctx->queue_active) {
+               workqueue_terminate(&ctx->event_queue);
+               workqueue_destroy(&ctx->event_queue);
+       }
+       /* Close the stream before freeing the buffer that setvbuf gave it. */
+       if (ctx->mon_fp)
+               fclose(ctx->mon_fp);
+       ctx->mon_fp = NULL;
+       free(ctx->mon_buf);
+       ctx->mon_buf = NULL;
+}
+
+/* Print the command line summary to stderr and exit with EXIT_FAILURE. */
+static void __attribute__((noreturn))
+usage(void)
+{
+       fprintf(stderr, "%s %s %s\n", _("Usage:"), progname,
+                       _("[OPTIONS] mountpoint"));
+       fprintf(stderr, "\n%s", _("Options:\n"));
+       fprintf(stderr, _("  --debug       Enable debugging messages.\n"));
+       fprintf(stderr, _("  --everything  Capture all events.\n"));
+       fprintf(stderr, _("  --foreground  Process events as soon as possible.\n"));
+       fprintf(stderr, _("  --help        Print this usage message.\n"));
+       fprintf(stderr, _("  --quiet       Do not log health events to stdout.\n"));
+       fprintf(stderr, _("  -V            Print version.\n"));
+       exit(EXIT_FAILURE);
+}
+
+enum long_opt_nr {     /* indices into long_options[] in main() */
+       LOPT_DEBUG,
+       LOPT_EVERYTHING,
+       LOPT_FOREGROUND,
+       LOPT_HELP,
+       LOPT_QUIET,
+       /* Not an option: slot of the array's terminating entry; keep last. */
+       LOPT_MAX,
+};
+
+/*
+ * Parse the command line, then monitor one mountpoint for health events until
+ * monitoring stops.  -V and usage errors exit before any filesystem is opened;
+ * otherwise the exit status comes from systemd_service_exit().
+ */
+int
+main(
+       int                     argc,
+       char                    **argv)
+{
+       struct healer_ctx       ctx = {
+               .conlock        = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
+               .log            = 1,
+               .mnt.fd         = -1,
+       };
+       int                     option_index;
+       int                     vflag = 0;
+       int                     c;
+       int                     ret;
+
+       progname = basename(argv[0]);
+       /* Set up the locale and message catalog used by _(). */
+       setlocale(LC_ALL, "");
+       bindtextdomain(PACKAGE, LOCALEDIR);
+       textdomain(PACKAGE);
+
+       /* Flag-style entries make getopt_long store into ctx and return 0. */
+       struct option long_options[] = {
+               [LOPT_DEBUG]       = {"debug", no_argument, &ctx.debug, 1 },
+               [LOPT_EVERYTHING]  = {"everything", no_argument, &ctx.everything, 1 },
+               [LOPT_FOREGROUND]  = {"foreground", no_argument, &ctx.foreground, 1 },
+               [LOPT_HELP]        = {"help", no_argument, NULL, 0 },
+               [LOPT_QUIET]       = {"quiet", no_argument, &ctx.log, 0 },
+
+               [LOPT_MAX]         = {NULL, 0, NULL, 0 },
+       };
+
+       for (;;) {
+               c = getopt_long(argc, argv, "V", long_options, &option_index);
+               if (c == EOF)
+                       break;
+
+               /* Flag-style long options need no further handling here. */
+               if (c == 'V')
+                       vflag++;
+               else if (c != 0 || option_index == LOPT_HELP)
+                       usage();
+       }
+
+       if (vflag) {
+               fprintf(stdout, "%s %s %s\n", progname, _("version"), VERSION);
+               fflush(stdout);
+               return EXIT_SUCCESS;
+       }
+
+       /* Exactly one positional argument: the mountpoint. */
+       if (optind != argc - 1)
+               usage();
+
+       ctx.mntpoint = argv[optind];
+
+       ret = setup_monitor(&ctx);
+       if (!ret)
+               ret = monitor(&ctx);
+
+       /*
+        * Runs even if setup failed; teardown_monitor skips anything that was
+        * never set up.
+        */
+       teardown_monitor(&ctx);
+       free((char *)ctx.fsname);
+       return systemd_service_exit(ret);
+}
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h

new file mode 100644 (file)

index 0000000..bcddde5
--- /dev/null
+++ b/healer/xfs_healer.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HEALER_XFS_HEALER_H_
+#define XFS_HEALER_XFS_HEALER_H_
+
+extern char *progname;
+
+/*
+ * When running in environments with restrictive security policies, healer
+ * might not be allowed to access the global mount tree.  However, processes
+ * are usually still allowed to see their own mount tree, so use this path for
+ * all mount table queries.
+ */
+#define _PATH_PROC_MOUNTS      "/proc/self/mounts"
+
+struct healer_ctx {
+       /* CLI options; must be int because getopt_long stores into them */
+       int                     debug;
+       int                     log;
+       int                     everything;
+       int                     foreground;
+
+       /* fd and fs geometry for mount */
+       struct xfs_fd           mnt;
+
+       /* Shared reference to the user's mountpoint for logging */
+       const char              *mntpoint;
+
+       /* Owned copy of the getmntent fsname for reconnecting, freed by main */
+       const char              *fsname;
+
+       /* file stream of monitor, and the buffer handed to it with setvbuf */
+       FILE                    *mon_fp;
+       char                    *mon_buf;
+
+       /* coordinates logging printfs */
+       pthread_mutex_t         conlock;
+
+       /* event queue; queue_active is set once the queue has been created */
+       struct workqueue        event_queue;
+       bool                    queue_active;
+};
+
+#endif /* XFS_HEALER_XFS_HEALER_H_ */
diff --git a/include/builddefs.in b/include/builddefs.in

index d2d25c8a0ed676f1d742a49fc04e75ee4476ca3e..0ab2bf1702f0f0038f984fcec338094567d1310c 100644 (file)
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -91,6 +91,7 @@ ENABLE_SHARED = @enable_shared@
  ENABLE_GETTEXT = @enable_gettext@
  ENABLE_EDITLINE        = @enable_editline@
  ENABLE_SCRUB   = @enable_scrub@
+ENABLE_HEALER  = @enable_healer@
  
  HAVE_ZIPPED_MANPAGES = @have_zipped_manpages@
author	Darrick J. Wong <djwong@kernel.org>
	Sun, 22 Feb 2026 22:41:14 +0000 (14:41 -0800)
committer	Darrick J. Wong <djwong@kernel.org>
	Thu, 9 Apr 2026 22:30:17 +0000 (15:30 -0700)
Makefile		patch \| blob \| history
configure.ac		patch \| blob \| history
healer/Makefile	[new file with mode: 0644]	patch \| blob
healer/xfs_healer.c	[new file with mode: 0644]	patch \| blob
healer/xfs_healer.h	[new file with mode: 0644]	patch \| blob
include/builddefs.in		patch \| blob \| history