From 33402483458e1ea76cb457994ff0d8ae1ac169ca Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Mon, 29 Jul 2024 16:23:10 -0700
Subject: [PATCH] xfs_scrub: hoist non-rendering character predicate

Hoist this predicate code into its own function; we're going to use it
elsewhere later on. While we're at it, document how we generated this
list in the first place.

Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
---
 scrub/unicrash.c | 49 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/scrub/unicrash.c b/scrub/unicrash.c
index 456caec27..1a86b5f8c 100644
--- a/scrub/unicrash.c
+++ b/scrub/unicrash.c
@@ -170,6 +170,36 @@ remove_ignorable(
 	return dest;
 }
 
+/*
+ * Certain unicode codepoints are formatting hints that are not themselves
+ * supposed to be rendered by a display system. These codepoints can be
+ * encoded in file names to try to confuse users.
+ *
+ * Download https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt and
+ * $ grep -E '(zero width|invisible|joiner|application)' -i UnicodeData.txt
+ */
+static inline bool is_nonrendering(UChar32 uchr)
+{
+	switch (uchr) {
+	case 0x034F:	/* combining grapheme joiner */
+	case 0x200B:	/* zero width space */
+	case 0x200C:	/* zero width non-joiner */
+	case 0x200D:	/* zero width joiner */
+	case 0x2028:	/* line separator */
+	case 0x2029:	/* paragraph separator */
+	case 0x2060:	/* word joiner */
+	case 0x2061:	/* function application */
+	case 0x2062:	/* invisible times (multiply) */
+	case 0x2063:	/* invisible separator (comma) */
+	case 0x2064:	/* invisible plus (addition) */
+	case 0x2D7F:	/* tifinagh consonant joiner */
+	case 0xFEFF:	/* zero width non breaking space */
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Generate normalized form and skeleton of the name.  If this fails, just
  * forget everything and return false; this is an advisory checker.
@@ -349,24 +379,9 @@ name_entry_examine(
 
 	uiter_setString(&uiter, entry->normstr, entry->normstrlen);
 	while ((uchr = uiter_next32(&uiter)) != U_SENTINEL) {
-		/* zero width character sequences */
-		switch (uchr) {
-		case 0x034F:	/* combining grapheme joiner */
-		case 0x200B:	/* zero width space */
-		case 0x200C:	/* zero width non-joiner */
-		case 0x200D:	/* zero width joiner */
-		case 0x2028:	/* line separator */
-		case 0x2029:	/* paragraph separator */
-		case 0x2060:	/* word joiner */
-		case 0x2061:	/* function application */
-		case 0x2062:	/* invisible times (multiply) */
-		case 0x2063:	/* invisible separator (comma) */
-		case 0x2064:	/* invisible plus (addition) */
-		case 0x2D7F:	/* tifinagh consonant joiner */
-		case 0xFEFF:	/* zero width non breaking space */
+		/* characters are invisible */
+		if (is_nonrendering(uchr))
 			*badflags |= UNICRASH_ZERO_WIDTH;
-			break;
-		}
 
 		/* control characters */
 		if (u_iscntrl(uchr))
-- 
2.39.5