From: Darrick J. Wong Date: Sun, 22 Feb 2026 22:41:16 +0000 (-0800) Subject: xfs_healer: run full scrub after lost corruption events or targeted repair failure X-Git-Tag: v7.0.0~40 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a162dc462186a6ad170abe4637e0d3483621648c;p=xfsprogs-dev.git xfs_healer: run full scrub after lost corruption events or targeted repair failure If we fail to perform a spot repair of metadata or the kernel tells us that it lost corruption events due to queue limits, initiate a full run of the online fsck service to try to fix the error. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- diff --git a/healer/Makefile b/healer/Makefile index 1eeb7276..b8ffce33 100644 --- a/healer/Makefile +++ b/healer/Makefile @@ -19,6 +19,8 @@ xfs_healer.c HFILES = \ xfs_healer.h +CFLAGS+=-DXFS_SCRUB_SVCNAME=\"$(XFS_SCRUB_SVCNAME)\" + LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD) LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG) LLDFLAGS = -static diff --git a/healer/fsrepair.c b/healer/fsrepair.c index 4534104f..9f8c128e 100644 --- a/healer/fsrepair.c +++ b/healer/fsrepair.c @@ -9,8 +9,14 @@ #include "libfrog/fsgeom.h" #include "libfrog/workqueue.h" #include "libfrog/healthevent.h" +#include "libfrog/systemd.h" #include "xfs_healer.h" +enum what_next { + NEED_FULL_REPAIR, + REPAIR_DONE, +}; + /* Translate scrub output flags to outcome. */ static enum repair_outcome from_repair_oflags(uint32_t oflags) { @@ -61,7 +67,7 @@ xfs_repair_metadata( } /* React to a fs-domain corruption event by repairing it. */ -static void +static enum what_next try_repair_wholefs( struct healer_ctx *ctx, const struct hme_prefix *pfx, @@ -90,11 +96,16 @@ try_repair_wholefs( pthread_mutex_lock(&ctx->conlock); report_health_repair(pfx, hme, f->event_mask, outcome); pthread_mutex_unlock(&ctx->conlock); + + if (outcome == REPAIR_FAILED) + return NEED_FULL_REPAIR; } + + return REPAIR_DONE; } /* React to an ag corruption event by repairing it. */ -static void +static enum what_next try_repair_ag( struct healer_ctx *ctx, const struct hme_prefix *pfx, @@ -126,11 +137,16 @@ try_repair_ag( pthread_mutex_lock(&ctx->conlock); report_health_repair(pfx, hme, f->event_mask, outcome); pthread_mutex_unlock(&ctx->conlock); + + if (outcome == REPAIR_FAILED) + return NEED_FULL_REPAIR; } + + return REPAIR_DONE; } /* React to a rtgroup corruption event by repairing it. */ -static void +static enum what_next try_repair_rtgroup( struct healer_ctx *ctx, const struct hme_prefix *pfx, @@ -157,11 +173,16 @@ try_repair_rtgroup( pthread_mutex_lock(&ctx->conlock); report_health_repair(pfx, hme, f->event_mask, outcome); pthread_mutex_unlock(&ctx->conlock); + + if (outcome == REPAIR_FAILED) + return NEED_FULL_REPAIR; } + + return REPAIR_DONE; } /* React to a inode-domain corruption event by repairing it. */ -static void +static enum what_next try_repair_inode( struct healer_ctx *ctx, const struct hme_prefix *orig_pfx, @@ -204,7 +225,12 @@ try_repair_inode( pthread_mutex_lock(&ctx->conlock); report_health_repair(pfx, hme, f->event_mask, outcome); pthread_mutex_unlock(&ctx->conlock); + + if (outcome == REPAIR_FAILED) + return NEED_FULL_REPAIR; } + + return REPAIR_DONE; } /* Repair a metadata corruption. */ @@ -214,6 +240,7 @@ repair_metadata( const struct hme_prefix *pfx, const struct xfs_health_monitor_event *hme) { + enum what_next what_next; int repair_fd; int ret; @@ -227,19 +254,25 @@ repair_metadata( switch (hme->domain) { case XFS_HEALTH_MONITOR_DOMAIN_FS: - try_repair_wholefs(ctx, pfx, repair_fd, hme); + what_next = try_repair_wholefs(ctx, pfx, repair_fd, hme); break; case XFS_HEALTH_MONITOR_DOMAIN_AG: - try_repair_ag(ctx, pfx, repair_fd, hme); + what_next = try_repair_ag(ctx, pfx, repair_fd, hme); break; case XFS_HEALTH_MONITOR_DOMAIN_RTGROUP: - try_repair_rtgroup(ctx, pfx, repair_fd, hme); + what_next = try_repair_rtgroup(ctx, pfx, repair_fd, hme); break; case XFS_HEALTH_MONITOR_DOMAIN_INODE: - try_repair_inode(ctx, pfx, repair_fd, hme); + what_next = try_repair_inode(ctx, pfx, repair_fd, hme); break; + default: + what_next = REPAIR_DONE; } + /* Transform into a full repair if we failed to fix this item. */ + if (what_next == NEED_FULL_REPAIR) + run_full_repair(ctx); + close(repair_fd); return 0; } @@ -259,3 +292,35 @@ healer_can_repair( ret = ioctl(ctx->mnt.fd, XFS_IOC_SCRUB_METADATA, &sm); return ret ? false : true; } + +/* Run a full repair of the filesystem using the background fsck service. */ +void +run_full_repair( + struct healer_ctx *ctx) +{ + char unitname[PATH_MAX]; + int ret; + + ret = weakhandle_instance_unit_name(ctx->wh, XFS_SCRUB_SVCNAME, + unitname, PATH_MAX); + if (ret) { + fprintf(stderr, "%s: %s\n", ctx->mntpoint, + _("Could not determine xfs_scrub unit name.")); + return; + } + + /* + * Scrub could already be repairing something, so try to start the unit + * and be content if it's already running. + */ + ret = systemd_manage_unit(UM_START, unitname); + if (ret) { + fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint, + _("Could not start xfs_scrub service unit"), + unitname); + return; + } + + printf("%s: %s\n", ctx->mntpoint, _("Full repairs in progress.")); + fflush(stdout); +} diff --git a/healer/weakhandle.c b/healer/weakhandle.c index 8950e0eb..849aa288 100644 --- a/healer/weakhandle.c +++ b/healer/weakhandle.c @@ -13,6 +13,7 @@ #include "libfrog/workqueue.h" #include "libfrog/getparents.h" #include "libfrog/paths.h" +#include "libfrog/systemd.h" #include "xfs_healer.h" struct weakhandle { @@ -199,3 +200,15 @@ weakhandle_getpath_for( close(mnt_fd); return ret; } + +/* Compute the systemd instance unit name for this mountpoint. */ +int +weakhandle_instance_unit_name( + struct weakhandle *wh, + const char *template, + char *unitname, + size_t unitnamelen) +{ + return systemd_path_instance_unit_name(template, wh->mntpoint, + unitname, unitnamelen); +} diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c index 95e952ed..d885eb0a 100644 --- a/healer/xfs_healer.c +++ b/healer/xfs_healer.c @@ -142,6 +142,15 @@ handle_event( pthread_mutex_unlock(&ctx->conlock); } + /* + * If we didn't ask for all the metadata reports (including the healthy + * ones) and the kernel tells us it lost something, run a full repair + * if we're expected to fix things. + */ + if (hme->type == XFS_HEALTH_MONITOR_TYPE_LOST && !ctx->everything && + ctx->want_repair) + run_full_repair(ctx); + /* Initiate a repair if appropriate. */ if (will_repair) repair_metadata(ctx, &pfx, hme); diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h index a2a46053..e1370323 100644 --- a/healer/xfs_healer.h +++ b/healer/xfs_healer.h @@ -72,6 +72,7 @@ void lookup_path(struct healer_ctx *ctx, int repair_metadata(struct healer_ctx *ctx, const struct hme_prefix *pfx, const struct xfs_health_monitor_event *hme); bool healer_can_repair(struct healer_ctx *ctx); +void run_full_repair(struct healer_ctx *ctx); /* weakhandle.c */ int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname, @@ -80,5 +81,7 @@ int weakhandle_reopen(struct weakhandle *wh, int *fd); void weakhandle_free(struct weakhandle **whp); int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen, char *path, size_t pathlen); +int weakhandle_instance_unit_name(struct weakhandle *wh, const char *template, + char *unitname, size_t unitnamelen); #endif /* XFS_HEALER_XFS_HEALER_H_ */ diff --git a/include/builddefs.in b/include/builddefs.in index bdba9cd9..3b52d1af 100644 --- a/include/builddefs.in +++ b/include/builddefs.in @@ -62,6 +62,7 @@ MKFS_CFG_DIR = @datadir@/@pkg_name@/mkfs PKG_STATE_DIR = @localstatedir@/lib/@pkg_name@ XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP=$(PKG_STATE_DIR)/xfs_scrub_all_media.stamp +XFS_SCRUB_SVCNAME=xfs_scrub@.service CC = @cc@ BUILD_CC = @BUILD_CC@ diff --git a/scrub/Makefile b/scrub/Makefile index ff79a265..aee49bfc 100644 --- a/scrub/Makefile +++ b/scrub/Makefile @@ -8,7 +8,6 @@ include $(builddefs) SCRUB_PREREQS=$(HAVE_GETFSMAP) -scrub_svcname=xfs_scrub@.service scrub_media_svcname=xfs_scrub_media@.service ifeq ($(SCRUB_PREREQS),yes) @@ -21,7 +20,7 @@ XFS_SCRUB_SERVICE_ARGS = -b -o autofsck ifeq ($(HAVE_SYSTEMD),yes) INSTALL_SCRUB += install-systemd SYSTEMD_SERVICES=\ - $(scrub_svcname) \ + $(XFS_SCRUB_SVCNAME) \ xfs_scrub_fail@.service \ $(scrub_media_svcname) \ xfs_scrub_media_fail@.service \ @@ -123,7 +122,7 @@ xfs_scrub_all.timer: xfs_scrub_all.timer.in $(builddefs) $(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/gettext.py @echo " [SED] $@" $(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \ - -e "s|@scrub_svcname@|$(scrub_svcname)|g" \ + -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \ -e "s|@scrub_media_svcname@|$(scrub_media_svcname)|g" \ -e "s|@pkg_version@|$(PKG_VERSION)|g" \ -e "s|@stampfile@|$(XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP)|g" \ @@ -137,7 +136,7 @@ $(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/g xfs_scrub_fail: xfs_scrub_fail.in $(builddefs) @echo " [SED] $@" $(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \ - -e "s|@scrub_svcname@|$(scrub_svcname)|g" \ + -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \ -e "s|@pkg_version@|$(PKG_VERSION)|g" < $< > $@ $(Q)chmod a+x $@