From 78691d321d3ced54fc82de86060e5ab0a3944deb Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 23 Apr 2019 13:22:31 +0800 Subject: [PATCH] common: optimize encode_utf8 Unroll loop manually to accelerate UTF-8 byte sequence encoding. Achieves 30% to 50% performance gain on x86 and Arm servers. NOTE: Per https://en.wikipedia.org/wiki/UTF-8#Invalid_code_points, since RFC 3629 (November 2003), code points after U+10FFFF must be treated as invalid UTF-8 byte sequences. But to be compatible with current code, this implementation still accepts these illegal strings. Signed-off-by: Yibo Cai --- src/common/utf8.c | 63 ++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/src/common/utf8.c b/src/common/utf8.c index c2d7917f837..3a05789f603 100644 --- a/src/common/utf8.c +++ b/src/common/utf8.c @@ -61,37 +61,44 @@ static int high_bits_set(int c) */ int encode_utf8(unsigned long u, unsigned char *buf) { - static const unsigned long max_val[MAX_UTF8_SZ] = { - 0x0000007ful, 0x000007fful, 0x0000fffful, - 0x001ffffful, 0x03fffffful, 0x7ffffffful - }; - static const int MAX_VAL_SZ = sizeof(max_val)/sizeof(max_val[0]); - - int i; - for (i = 0; i < MAX_VAL_SZ; ++i) { - if (u <= max_val[i]) - break; - } - if (i == MAX_VAL_SZ) { - // This code point is too big to encode. 
- return -1; - } - - if (i == 0) { + /* Unroll loop for common code points */ + if (u <= 0x0000007F) { buf[0] = u; - } - else { - signed int j; - for (j = i; j > 0; --j) { - buf[j] = 0x80 | (u & 0x3f); - u >>= 6; + return 1; + } else if (u <= 0x000007FF) { + buf[0] = 0xC0 | (u >> 6); + buf[1] = 0x80 | (u & 0x3F); + return 2; + } else if (u <= 0x0000FFFF) { + buf[0] = 0xE0 | (u >> 12); + buf[1] = 0x80 | ((u >> 6) & 0x3F); + buf[2] = 0x80 | (u & 0x3F); + return 3; + } else if (u <= 0x001FFFFF) { + buf[0] = 0xF0 | (u >> 18); + buf[1] = 0x80 | ((u >> 12) & 0x3F); + buf[2] = 0x80 | ((u >> 6) & 0x3F); + buf[3] = 0x80 | (u & 0x3F); + return 4; + } else { + /* Rare/illegal code points */ + if (u <= 0x03FFFFFF) { + for (int i = 4; i >= 1; --i) { + buf[i] = 0x80 | (u & 0x3F); + u >>= 6; + } + buf[0] = 0xF8 | u; + return 5; + } else if (u <= 0x7FFFFFFF) { + for (int i = 5; i >= 1; --i) { + buf[i] = 0x80 | (u & 0x3F); + u >>= 6; + } + buf[0] = 0xFC | u; + return 6; } - - unsigned char mask = ~(0xFF >> (i + 1)); - buf[0] = mask | u; + return -1; } - - return i + 1; } /* -- 2.39.5