common: optimize check_utf8

author Yibo Cai <yibo.cai@arm.com>

Fri, 12 Apr 2019 07:51:31 +0000 (15:51 +0800)

committer Yibo Cai <yibo.cai@arm.com>

Wed, 17 Apr 2019 01:46:45 +0000 (09:46 +0800)
author Yibo Cai <yibo.cai@arm.com>
Fri, 12 Apr 2019 07:51:31 +0000 (15:51 +0800)
committer Yibo Cai <yibo.cai@arm.com>
Wed, 17 Apr 2019 01:46:45 +0000 (09:46 +0800)
diff --git a/src/common/utf8.c b/src/common/utf8.c

index 9b7aaf5fdac968531a4a4184f390c13f5c332602..c2d7917f837d837512b4eff6e9c663318770b2d8 100644 (file)
--- a/src/common/utf8.c
+++ b/src/common/utf8.c
@@ -15,6 +15,34 @@
  
  #include <string.h>
  
+/*
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+ *
+ * Table 3-7. Well-Formed UTF-8 Byte Sequences
+ *
+ * +--------------------+------------+-------------+------------+-------------+
+ * | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0000..U+007F     | 00..7F     |             |            |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ */
+
  static int high_bits_set(int c)
  {
         int ret = 0;
@@ -107,52 +135,69 @@ unsigned long decode_utf8(unsigned char *buf, int nbytes)
  
  int check_utf8(const char *buf, int len)
  {
-       unsigned char u[MAX_UTF8_SZ];
-       int enc_len = 0;
-       int i = 0;
-       while (1) {
-               unsigned int c = buf[i];
-               if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
-                       // the start of a new character. Process what we have
-                       // in the buffer.
-                       if (enc_len > 0) {
-                               int re_encoded_len;
-                               unsigned char re_encoded[MAX_UTF8_SZ];
-                               unsigned long code = decode_utf8(u, enc_len);
-                               if (code == INVALID_UTF8_CHAR) {
-                                       //printf("decoded to invalid utf8");
-                                       return i + 1;
+       /*
+        * Check that the first len bytes of buf form well-formed UTF-8,
+        * using the byte ranges of Unicode Table 3-7. The range checks
+        * below reject overlong forms (C0/C1 lead bytes, E0 80..9F,
+        * F0 80..8F), surrogates (ED A0..BF) and anything above U+10FFFF
+        * (F4 90.. and lead bytes F5..FF).
+        *
+        * Returns 0 when every sequence is well formed. Otherwise returns
+        * the 1-based position of the first byte of the first bad
+        * sequence; a sequence cut short by the end of the buffer is
+        * reported the same way.
+        *
+        * NOTE(review): the loop only stops when len reaches exactly 0.
+        * A negative len never gets there by subtraction, so the loop
+        * would keep reading for as long as it sees bytes <= 0x7F.
+        * Callers are presumably expected to pass len >= 0 -- confirm.
+        *
+        * "char" is "signed" on x86 but "unsigned" on aarch64 by default.
+        * The code below depends on signed/unsigned comparisons, so define
+        * an unsigned buffer explicitly to get the same result on both.
+        */
+       const unsigned char *bufu = (const unsigned char *)buf;
+       int err_pos = 1; /* 1-based position of the current sequence's first byte */
+
+       while (len) {
+               int nbytes;
+               unsigned char byte1 = bufu[0];
+
+               /* 00..7F */
+               if (byte1 <= 0x7F) {
+                       nbytes = 1;
+               /* C2..DF, 80..BF */
+               } else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
+                               (signed char)bufu[1] <= (signed char)0xBF) {
+                       nbytes = 2;
+               } else if (len >= 3) {
+                       unsigned char byte2 = bufu[1];
+
+                       /*
+                        * Is byte2, byte3 between 0x80 ~ 0xBF?
+                        * Reinterpreted as signed char (two's complement
+                        * assumed), 0x80..0xBF become -128..-65 while every
+                        * other byte value is greater than -65, so a single
+                        * signed "<= 0xBF" comparison checks both bounds.
+                        */
+                       int byte2_ok = (signed char)byte2 <= (signed char)0xBF;
+                       int byte3_ok = (signed char)bufu[2] <= (signed char)0xBF;
+
+                       if (byte2_ok && byte3_ok &&
+                                       /* E0, A0..BF, 80..BF */
+                                       ((byte1 == 0xE0 && byte2 >= 0xA0) ||
+                                        /* E1..EC, 80..BF, 80..BF */
+                                        (byte1 >= 0xE1 && byte1 <= 0xEC) ||
+                                        /* ED, 80..9F, 80..BF */
+                                        (byte1 == 0xED && byte2 <= 0x9F) ||
+                                        /* EE..EF, 80..BF, 80..BF */
+                                        (byte1 >= 0xEE && byte1 <= 0xEF))) {
+                               nbytes = 3;
+                       } else if (len >= 4) {
+                               /* Is byte4 between 0x80 ~ 0xBF (same signed comparison as for byte2, byte3) */
+                               int byte4_ok = (signed char)bufu[3] <= (signed char)0xBF;
+
+                               if (byte2_ok && byte3_ok && byte4_ok &&
+                                               /* F0, 90..BF, 80..BF, 80..BF */
+                                               ((byte1 == 0xF0 && byte2 >= 0x90) ||
+                                                /* F1..F3, 80..BF, 80..BF, 80..BF */
+                                                (byte1 >= 0xF1 && byte1 <= 0xF3) ||
+                                                /* F4, 80..8F, 80..BF, 80..BF */
+                                                (byte1 == 0xF4 && byte2 <= 0x8F))) {
+                                       nbytes = 4;
+                               } else {
+                                       return err_pos;
                                 }
-                               re_encoded_len = encode_utf8(code, re_encoded);
-                               if (enc_len != re_encoded_len) {
-                                       //printf("originally encoded as %d bytes, "
-                                       //      "but was re-encoded to %d!\n",
-                                       //      enc_len, re_encoded_len);
-                                       return i + 1;
-                               }
-                               if (memcmp(u, re_encoded, enc_len) != 0) {
-                                       //printf("re-encoded to a different "
-                                       //      "byte stream!");
-                                       return i + 1;
-                               }
-                               //printf("code_point %lu\n", code);
+                       } else {
+                               return err_pos;
                         }
-                       enc_len = 0;
-                       if (i >= len)
-                               break;
-                       // start collecting again?
-                       if (c >= 0x80)
-                               u[enc_len++] = c;
                 } else {
-                       if (enc_len == MAX_UTF8_SZ) {
-                               //printf("too many enc_len in utf character!\n");
-                               return i + 1;
-                       }
-                       //printf("continuation byte...\n");
-                       u[enc_len++] = c;
+                       return err_pos;
                 }
-               ++i;
+
+               len -= nbytes;
+               err_pos += nbytes;
+               bufu += nbytes;
         }
+
         return 0;
  }
author	Yibo Cai <yibo.cai@arm.com>
	Fri, 12 Apr 2019 07:51:31 +0000 (15:51 +0800)
committer	Yibo Cai <yibo.cai@arm.com>
	Wed, 17 Apr 2019 01:46:45 +0000 (09:46 +0800)