git-server-git.apps.pok.os.sepia.ceph.com Git - xfsprogs-dev.git/commitdiff
xfs_healer: run full scrub after lost corruption events or targeted repair failure
author Darrick J. Wong <djwong@kernel.org>
Sun, 22 Feb 2026 22:41:16 +0000 (14:41 -0800)
committer Darrick J. Wong <djwong@kernel.org>
Thu, 9 Apr 2026 22:30:17 +0000 (15:30 -0700)
If we fail to perform a spot repair of metadata or the kernel tells us
that it lost corruption events due to queue limits, initiate a full run
of the online fsck service to try to fix the error.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
healer/Makefile
healer/fsrepair.c
healer/weakhandle.c
healer/xfs_healer.c
healer/xfs_healer.h
include/builddefs.in
scrub/Makefile

diff --git a/healer/Makefile b/healer/Makefile
index 1eeb727682008b51da140a68fe80e8c8304ff326..b8ffce33e90d185e8bb916e4e60ad01d18a8e9bd 100644 (file)
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -19,6 +19,8 @@ xfs_healer.c
 HFILES = \
 xfs_healer.h
 
+CFLAGS+=-DXFS_SCRUB_SVCNAME=\"$(XFS_SCRUB_SVCNAME)\"
+
 LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
 LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
 LLDFLAGS = -static
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 4534104f8a6ac14293e9089150e46e7f46a51472..9f8c128e395ebc7ef6d395547589cb504e5bbdc3 100644 (file)
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -9,8 +9,14 @@
 #include "libfrog/fsgeom.h"
 #include "libfrog/workqueue.h"
 #include "libfrog/healthevent.h"
+#include "libfrog/systemd.h"
 #include "xfs_healer.h"
 
+enum what_next {
+       NEED_FULL_REPAIR,       /* a targeted repair failed; run a full repair */
+       REPAIR_DONE,            /* no full repair is needed for this event */
+};
+
 /* Translate scrub output flags to outcome. */
 static enum repair_outcome from_repair_oflags(uint32_t oflags)
 {
@@ -61,7 +67,7 @@ xfs_repair_metadata(
 }
 
 /* React to a fs-domain corruption event by repairing it. */
-static void
+static enum what_next
 try_repair_wholefs(
        struct healer_ctx                       *ctx,
        const struct hme_prefix                 *pfx,
@@ -90,11 +96,16 @@ try_repair_wholefs(
                pthread_mutex_lock(&ctx->conlock);
                report_health_repair(pfx, hme, f->event_mask, outcome);
                pthread_mutex_unlock(&ctx->conlock);
+
+               if (outcome == REPAIR_FAILED)
+                       return NEED_FULL_REPAIR;
        }
+
+       return REPAIR_DONE;
 }
 
 /* React to an ag corruption event by repairing it. */
-static void
+static enum what_next
 try_repair_ag(
        struct healer_ctx                       *ctx,
        const struct hme_prefix                 *pfx,
@@ -126,11 +137,16 @@ try_repair_ag(
                pthread_mutex_lock(&ctx->conlock);
                report_health_repair(pfx, hme, f->event_mask, outcome);
                pthread_mutex_unlock(&ctx->conlock);
+
+               if (outcome == REPAIR_FAILED)
+                       return NEED_FULL_REPAIR;
        }
+
+       return REPAIR_DONE;
 }
 
 /* React to a rtgroup corruption event by repairing it. */
-static void
+static enum what_next
 try_repair_rtgroup(
        struct healer_ctx                       *ctx,
        const struct hme_prefix                 *pfx,
@@ -157,11 +173,16 @@ try_repair_rtgroup(
                pthread_mutex_lock(&ctx->conlock);
                report_health_repair(pfx, hme, f->event_mask, outcome);
                pthread_mutex_unlock(&ctx->conlock);
+
+               if (outcome == REPAIR_FAILED)
+                       return NEED_FULL_REPAIR;
        }
+
+       return REPAIR_DONE;
 }
 
 /* React to a inode-domain corruption event by repairing it. */
-static void
+static enum what_next
 try_repair_inode(
        struct healer_ctx                       *ctx,
        const struct hme_prefix                 *orig_pfx,
@@ -204,7 +225,12 @@ try_repair_inode(
                pthread_mutex_lock(&ctx->conlock);
                report_health_repair(pfx, hme, f->event_mask, outcome);
                pthread_mutex_unlock(&ctx->conlock);
+
+               if (outcome == REPAIR_FAILED)
+                       return NEED_FULL_REPAIR;
        }
+
+       return REPAIR_DONE;
 }
 
 /* Repair a metadata corruption. */
@@ -214,6 +240,7 @@ repair_metadata(
        const struct hme_prefix                 *pfx,
        const struct xfs_health_monitor_event   *hme)
 {
+       enum what_next                          what_next;
        int                                     repair_fd;
        int                                     ret;
 
@@ -227,19 +254,25 @@ repair_metadata(
 
        switch (hme->domain) {
        case XFS_HEALTH_MONITOR_DOMAIN_FS:
-               try_repair_wholefs(ctx, pfx, repair_fd, hme);
+               what_next = try_repair_wholefs(ctx, pfx, repair_fd, hme);
                break;
        case XFS_HEALTH_MONITOR_DOMAIN_AG:
-               try_repair_ag(ctx, pfx, repair_fd, hme);
+               what_next = try_repair_ag(ctx, pfx, repair_fd, hme);
                break;
        case XFS_HEALTH_MONITOR_DOMAIN_RTGROUP:
-               try_repair_rtgroup(ctx, pfx, repair_fd, hme);
+               what_next = try_repair_rtgroup(ctx, pfx, repair_fd, hme);
                break;
        case XFS_HEALTH_MONITOR_DOMAIN_INODE:
-               try_repair_inode(ctx, pfx, repair_fd, hme);
+               what_next = try_repair_inode(ctx, pfx, repair_fd, hme);
                break;
+       default:
+               what_next = REPAIR_DONE;
        }
 
+       /* Transform into a full repair if we failed to fix this item. */
+       if (what_next == NEED_FULL_REPAIR)
+               run_full_repair(ctx);
+
        close(repair_fd);
        return 0;
 }
@@ -259,3 +292,35 @@ healer_can_repair(
        ret = ioctl(ctx->mnt.fd, XFS_IOC_SCRUB_METADATA, &sm);
        return ret ? false : true;
 }
+
+/*
+ * Run a full repair of the filesystem using the background fsck service.
+ *
+ * Computes the systemd unit name for this mount from the XFS_SCRUB_SVCNAME
+ * template and asks systemd to start that unit.  Nothing is returned to the
+ * caller: if the unit name cannot be computed or the unit cannot be started,
+ * a message prefixed with the mountpoint is written to stderr and the
+ * function returns early.  On success a progress message is printed to
+ * stdout and flushed.
+ */
+void
+run_full_repair(
+       struct healer_ctx       *ctx)
+{
+       char                    unitname[PATH_MAX];
+       int                     ret;
+
+       ret = weakhandle_instance_unit_name(ctx->wh, XFS_SCRUB_SVCNAME,
+                       unitname, PATH_MAX);
+       if (ret) {
+               fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+                               _("Could not determine xfs_scrub unit name."));
+               return;
+       }
+
+       /*
+        * Scrub could already be repairing something, so try to start the unit
+        * and be content if it's already running.  A nonzero return is
+        * reported below and the start is not retried.
+        */
+       ret = systemd_manage_unit(UM_START, unitname);
+       if (ret) {
+               fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+                               _("Could not start xfs_scrub service unit"),
+                               unitname);
+               return;
+       }
+
+       printf("%s: %s\n", ctx->mntpoint, _("Full repairs in progress."));
+       fflush(stdout);
+}
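diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 8950e0eb1e5a43e56c9fef56d6795765b28a3444..849aa2882700d463a3953952204db8904fa9d6aa 100644 (file)
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c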
@@ -13,6 +13,7 @@
 #include "libfrog/workqueue.h"
 #include "libfrog/getparents.h"
 #include "libfrog/paths.h"
+#include "libfrog/systemd.h"
 #include "xfs_healer.h"
 
 struct weakhandle {
@@ -199,3 +200,15 @@ weakhandle_getpath_for(
        close(mnt_fd);
        return ret;
 }
+
+/*
+ * Compute the systemd instance unit name for this mountpoint.
+ *
+ * Hands template and the weak handle's mountpoint to
+ * systemd_path_instance_unit_name(), which writes the result into unitname,
+ * and returns that helper's return value unchanged.  unitnamelen is
+ * presumably the size of the unitname buffer in bytes (run_full_repair()
+ * passes PATH_MAX for a PATH_MAX-sized array).  run_full_repair() treats any
+ * nonzero return as failure.
+ */
+int
+weakhandle_instance_unit_name(
+       struct weakhandle       *wh,
+       const char              *template,
+       char                    *unitname,
+       size_t                  unitnamelen)
+{
+       return systemd_path_instance_unit_name(template, wh->mntpoint,
+                       unitname, unitnamelen);
+}
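diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 95e952ed5cb94023715ca1582f6c19d0dc1b3e91..d885eb0a50826b917fd69c9a4bcd8328db0b0e0c 100644 (file)
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c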
@@ -142,6 +142,15 @@ handle_event(
                pthread_mutex_unlock(&ctx->conlock);
        }
 
+       /*
+        * If we didn't ask for all the metadata reports (including the healthy
+        * ones) and the kernel tells us it lost something, run a full repair
+        * if we're expected to fix things.
+        */
+       if (hme->type == XFS_HEALTH_MONITOR_TYPE_LOST && !ctx->everything &&
+           ctx->want_repair)
+               run_full_repair(ctx);
+
        /* Initiate a repair if appropriate. */
        if (will_repair)
                repair_metadata(ctx, &pfx, hme);
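diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index a2a46053928e337b5d2f518bf922ae1b4de82b69..e1370323bbd66a5f6245e47c98390f0842afd9d1 100644 (file)
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h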
@@ -72,6 +72,7 @@ void lookup_path(struct healer_ctx *ctx,
 int repair_metadata(struct healer_ctx *ctx, const struct hme_prefix *pfx,
                const struct xfs_health_monitor_event *hme);
 bool healer_can_repair(struct healer_ctx *ctx);
+void run_full_repair(struct healer_ctx *ctx);
 
 /* weakhandle.c */
 int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname,
@@ -80,5 +81,7 @@ int weakhandle_reopen(struct weakhandle *wh, int *fd);
 void weakhandle_free(struct weakhandle **whp);
 int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen,
                char *path, size_t pathlen);
+int weakhandle_instance_unit_name(struct weakhandle *wh, const char *template,
+               char *unitname, size_t unitnamelen);
 
 #endif /* XFS_HEALER_XFS_HEALER_H_ */
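diff --git a/include/builddefs.in b/include/builddefs.in
index bdba9cd9037900a8452c927751ed8b15c8f27147..3b52d1afd7031c7741ae48256d7c9fea61ca9aa3 100644 (file)
--- a/include/builddefs.in
+++ b/include/builddefs.in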
@@ -62,6 +62,7 @@ MKFS_CFG_DIR  = @datadir@/@pkg_name@/mkfs
 PKG_STATE_DIR  = @localstatedir@/lib/@pkg_name@
 
 XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP=$(PKG_STATE_DIR)/xfs_scrub_all_media.stamp
+XFS_SCRUB_SVCNAME=xfs_scrub@.service
 
 CC             = @cc@
 BUILD_CC       = @BUILD_CC@
diff --git a/scrub/Makefile b/scrub/Makefile
index ff79a265762332e6d6ba8a3b00ea1bb32b998b0b..aee49bfce100e22c073d1da49f56cb18f2dd5ad6 100644 (file)
--- a/scrub/Makefile
+++ b/scrub/Makefile
@@ -8,7 +8,6 @@ include $(builddefs)
 
 SCRUB_PREREQS=$(HAVE_GETFSMAP)
 
-scrub_svcname=xfs_scrub@.service
 scrub_media_svcname=xfs_scrub_media@.service
 
 ifeq ($(SCRUB_PREREQS),yes)
@@ -21,7 +20,7 @@ XFS_SCRUB_SERVICE_ARGS = -b -o autofsck
 ifeq ($(HAVE_SYSTEMD),yes)
 INSTALL_SCRUB += install-systemd
 SYSTEMD_SERVICES=\
-       $(scrub_svcname) \
+       $(XFS_SCRUB_SVCNAME) \
        xfs_scrub_fail@.service \
        $(scrub_media_svcname) \
        xfs_scrub_media_fail@.service \
@@ -123,7 +122,7 @@ xfs_scrub_all.timer: xfs_scrub_all.timer.in $(builddefs)
 $(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/gettext.py
        @echo "    [SED]    $@"
        $(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
-                  -e "s|@scrub_svcname@|$(scrub_svcname)|g" \
+                  -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \
                   -e "s|@scrub_media_svcname@|$(scrub_media_svcname)|g" \
                   -e "s|@pkg_version@|$(PKG_VERSION)|g" \
                   -e "s|@stampfile@|$(XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP)|g" \
@@ -137,7 +136,7 @@ $(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/g
 xfs_scrub_fail: xfs_scrub_fail.in $(builddefs)
        @echo "    [SED]    $@"
        $(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
-                  -e "s|@scrub_svcname@|$(scrub_svcname)|g" \
+                  -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \
                   -e "s|@pkg_version@|$(PKG_VERSION)|g"  < $< > $@
        $(Q)chmod a+x $@